In [1]:
# Patch segment_anything's mask_decoder.py on disk so it accepts 5‑D src tensors.
# The installed file is rewritten in place, so the kernel has to be restarted
# afterwards for the patched module to be imported.
import glob, re, os

# Comment line written by the patch itself; its presence means the file was
# already rewritten, so re-running this cell must not insert the lines again.
PATCH_MARKER = "# accommodate extra token dim"

cands = glob.glob("/usr/local/lib/python3.*/dist-packages/segment_anything/modeling/mask_decoder.py")
if not cands:
    raise FileNotFoundError("Can't find segment_anything/modeling/mask_decoder.py")
decoder = cands[0]
print("Patching:", decoder)

#read & patch
with open(decoder, "r") as f:
    lines = f.readlines()

if any(PATCH_MARKER in line for line in lines):
    # Without this guard a second run would add another pair of `view` lines
    # before every `return masks, iou_pred`.
    print("segment_anything.mask_decoder is already patched – nothing to do.")
else:
    final = []
    for line in lines:
        if re.match(r"\s*b, c, h, w = src\.shape", line):
            #replace 4‑D unpack with 5‑D + collapse
            indent = line[:line.index("b")]
            final += [
                f"{indent}{PATCH_MARKER}\n",
                f"{indent}b, t, c, h, w = src.shape\n",
                f"{indent}src = src.view(b * t, c, h, w)\n",
                f"{indent}pos_src = pos_src.view(b * t, c, h, w)\n",
            ]
        elif line.strip() == "return masks, iou_pred":
            # NOTE(review): this is inserted before EVERY matching return in
            # the file; confirm b, t, h, w are in scope at each of them and
            # that the mask tensor really has b*t*h*w elements there.
            indent = line[:line.index("return")]
            final += [
                f"{indent}# reshape masks and iou back to [B, T, …]\n",
                f"{indent}masks = masks.view(b, t, 1, h, w)\n",
                f"{indent}iou_pred = iou_pred.view(b, t)\n",
                line,
            ]
        else:
            final.append(line)

    with open(decoder, "w") as f:
        f.writelines(final)

    print("Patched segment_anything.mask_decoder – now restart the kernel.")


Patching: /usr/local/lib/python3.11/dist-packages/segment_anything/modeling/mask_decoder.py
Patched segment_anything.mask_decoder – now restart the kernel.


In [14]:
PROJECT_PATH = "/kaggle/input/task-aware-sam-lora/task_aware_sam_lora"
import sys, os
sys.path.append(PROJECT_PATH)

!pip install -q transformers accelerate opencv-python matplotlib wandb datasets fsspec==2023.9.2 pycocotools

SAM_CKPT_PATH = "/kaggle/working/sam_vit_h_4b8939.pth"
if not os.path.exists(SAM_CKPT_PATH):
    !wget -q https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth -P /kaggle/working/


In [15]:
import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import cv2
from tqdm import tqdm
import warnings
# Silence all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device('cuda') if cuda_ok else torch.device('cpu')
print(f"Using device: {device}")
if cuda_ok:
    # Report name and total memory (in GiB) of GPU 0.
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")
    gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU Memory: {gpu_total_bytes / 1024**3:.1f} GB")


Using device: cuda
GPU: Tesla T4
GPU Memory: 14.7 GB


In [16]:
from src.models.hypernetwork import TaskAwareHyperNet, create_sam_lora_config
from src.models.lora_adapter import LoRAAdapter, LoRAConfig
from src.models.sam_wrapper import SAMWithLoRA, TaskAwareSAM
from src.data.dataset import TaskAwareDataset
from src.utils.text_processing import TaskDescriptionProcessor
from config.train_config import get_t4_optimized_config


In [4]:
# Configuration and model construction.
# Load the T4-oriented config and keep handles to the sub-configs.
base_cfg = get_t4_optimized_config()
model_cfg, system_cfg = base_cfg.model, base_cfg.system

# LoRA adapter settings, all read from the model config.
lora_kwargs = {
    "rank": model_cfg.lora_rank,
    "alpha": model_cfg.lora_alpha,
    "dropout": model_cfg.lora_dropout,
    "target_modules": model_cfg.lora_target_modules,
}
lora_cfg = LoRAConfig(**lora_kwargs)

# SAM wrapped with LoRA adapters, built from the downloaded checkpoint.
sam_model = SAMWithLoRA(
    sam_checkpoint=SAM_CKPT_PATH,
    model_type=model_cfg.sam_model_type,
    lora_config=lora_cfg,
    device=device,
)

# Hypernetwork: its LoRA layout is derived from SAM's mask decoder.
lora_config_dict = create_sam_lora_config(sam_model.sam.mask_decoder)
hypernet_kwargs = {
    "lora_config": lora_config_dict,
    "lora_rank": model_cfg.lora_rank,
    "text_encoder_model": model_cfg.text_encoder_model,
    "hidden_dim": model_cfg.hypernetwork_hidden_dim,
    "num_layers": model_cfg.hypernetwork_num_layers,
    "num_heads": model_cfg.hypernetwork_num_heads,
    "dropout": model_cfg.hypernetwork_dropout,
}
hypernet = TaskAwareHyperNet(**hypernet_kwargs)
hypernet = hypernet.to(device)

text_processor = TaskDescriptionProcessor()


Frozen SAM encoders
Added LoRA to: output_hypernetworks_mlps.0.layers.0 (as output_hypernetworks_mlps_0_layers_0)
Added LoRA to: output_hypernetworks_mlps.0.layers.1 (as output_hypernetworks_mlps_0_layers_1)
Added LoRA to: output_hypernetworks_mlps.0.layers.2 (as output_hypernetworks_mlps_0_layers_2)
Added LoRA to: output_hypernetworks_mlps.1.layers.0 (as output_hypernetworks_mlps_1_layers_0)
Added LoRA to: output_hypernetworks_mlps.1.layers.1 (as output_hypernetworks_mlps_1_layers_1)
Added LoRA to: output_hypernetworks_mlps.1.layers.2 (as output_hypernetworks_mlps_1_layers_2)
Added LoRA to: output_hypernetworks_mlps.2.layers.0 (as output_hypernetworks_mlps_2_layers_0)
Added LoRA to: output_hypernetworks_mlps.2.layers.1 (as output_hypernetworks_mlps_2_layers_1)
Added LoRA to: output_hypernetworks_mlps.2.layers.2 (as output_hypernetworks_mlps_2_layers_2)
Added LoRA to: output_hypernetworks_mlps.3.layers.0 (as output_hypernetworks_mlps_3_layers_0)
Added LoRA to: output_hypernetworks_mlps

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

2025-07-19 04:34:04.626799: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752899644.848224      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752899644.909247      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

TaskAwareHyperNet initialized with 26128 LoRA parameters
  output_hypernetworks_mlps.0.layers.0: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.0.layers.1: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.0.layers.2: 256x32, LoRA A: 1024, LoRA B: 128
  output_hypernetworks_mlps.1.layers.0: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.1.layers.1: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.1.layers.2: 256x32, LoRA A: 1024, LoRA B: 128
  output_hypernetworks_mlps.2.layers.0: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.2.layers.1: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.2.layers.2: 256x32, LoRA A: 1024, LoRA B: 128
  output_hypernetworks_mlps.3.layers.0: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.3.layers.1: 256x256, LoRA A: 1024, LoRA B: 1024
  output_hypernetworks_mlps.3.layers.2: 256x32, LoRA A: 1024, LoRA B: 128
  iou_prediction_head.layers.0: 256x256

In [5]:
#transforms/loader
import os

from torch.utils.data import DataLoader
from torchvision import transforms

# The DataLoader below starts worker processes (num_workers=2). A HuggingFace
# tokenizer has already been used in this process by then, which makes
# `tokenizers` print a "process just got forked" warning and disable its
# parallelism on every fork. Configuring the variable explicitly before any
# worker exists silences that; setdefault keeps a value the user already chose.
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Every image is resized to 128x128 and converted to a tensor.
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
])

# Training split in "instance" mode, capped at 500 samples.
COCO_DIR = "/kaggle/input/coco-2017-dataset/coco2017"
coco_dataset = TaskAwareDataset(
    data_dir=COCO_DIR,
    split="train",
    mode="instance",
    transform=transform,
    max_samples=500
)

# One sample per batch, shuffled, loaded by two worker processes.
train_dataloader = DataLoader(
    coco_dataset,
    batch_size=1,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)


loading annotations into memory...
Done (t=21.56s)
creating index...
index created!


In [6]:
# train fcn
import os
import numpy as np
import torch
import torch.nn.functional as F
from tqdm import tqdm

def to_str_list(x, verbose=False):
    """Convert a collated ``task_description`` value into a flat list of strings.

    Args:
        x: A str, bytes, torch.Tensor, tuple, list (of str, bytes, tensors or
            nested lists), or any other object.
        verbose: When True, print the raw input type and value for debugging.

    Returns:
        list[str]: One string per element. Bytes are decoded as UTF-8, tensor
        elements are converted with ``str`` on their Python scalar value, nested
        lists are flattened one level, and anything else is wrapped as
        ``[str(x)]``.
    """
    if verbose:
        print(f"[DEBUG] raw task_description type: {type(x)}, value: {x}")
    if isinstance(x, str):
        return [x]
    if isinstance(x, torch.Tensor):
        # torch has no object dtype: the former `torch.object` comparison raised
        # AttributeError for every tensor input. Flattening first also handles
        # 0-d and multi-dimensional tensors uniformly.
        return [str(value) for value in x.flatten().tolist()]
    if isinstance(x, bytes):
        return [x.decode("utf8")]
    if isinstance(x, tuple):
        # Treat tuples like lists so that e.g. a tuple of bytes is decoded
        # instead of being rendered as "b'...'".
        x = list(x)
    if isinstance(x, list):
        if all(isinstance(xx, list) for xx in x):
            return [str(jj) for xx in x for jj in xx]
        if all(isinstance(xx, torch.Tensor) for xx in x):
            return [str(xx.item()) if xx.numel() == 1 else str(xx) for xx in x]
        if all(isinstance(xx, bytes) for xx in x):
            return [xx.decode("utf8") for xx in x]
        if all(isinstance(xx, str) for xx in x):
            return x
        return [str(xx) for xx in x]
    return [str(x)]

# AdamW over the parameters returned by the hypernetwork; learning rate and
# weight decay are taken from the training section of the config.
hypernet_params = hypernet.get_hypernetwork_params()
training_cfg = base_cfg.training
optimizer = torch.optim.AdamW(
    hypernet_params,
    lr=training_cfg.learning_rate,
    weight_decay=training_cfg.weight_decay,
)

def train_epoch(hypernetwork, sam_model, dataloader, optimizer, epoch, device, verbose=False):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Per batch: the hypernetwork turns the task descriptions into LoRA weights,
    those are applied to ``sam_model``, an embedding is computed for every
    image, masks are predicted from point prompts, and a BCE-with-logits loss
    against the ground-truth masks (resized to the prediction resolution) is
    back-propagated through ``optimizer``.

    Args:
        hypernetwork: Callable module mapping a list of task strings to a dict
            of named LoRA weight tensors.
        sam_model: Model exposing ``apply_lora``, ``get_image_embeddings`` and a
            forward call taking embeddings and point prompts.
        dataloader: Iterable of dict batches with keys ``image``, ``mask``,
            ``task_description`` and optionally ``points`` / ``point_labels``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, used only for the progress-bar label.
        device: Device (``torch.device`` or string) tensors are moved to.
        verbose: When True, print per-batch debug information (task
            descriptions and the shape of every generated LoRA tensor).

    Returns:
        float: Sum of batch losses divided by ``len(dataloader)``; 0.0 when the
        dataloader is empty.
    """
    hypernetwork.train()
    sam_model.train()
    total_loss = 0.0

    # Mixed precision and cache clearing only make sense on CUDA.
    on_cuda = torch.device(device).type == 'cuda'

    pbar = tqdm(dataloader, desc=f'Epoch {epoch+1}')
    for batch_idx, batch in enumerate(pbar):
        images = batch['image'].to(device)
        masks = batch['mask'].to(device)
        raw_task_desc = batch['task_description']

        task_descriptions = to_str_list(raw_task_desc)
        if verbose:
            print(f"[DEBUG] converted task_descriptions type: {type(task_descriptions)} value: {task_descriptions}")

        optimizer.zero_grad()

        # get LoRA weights from hypernetwork
        lora_weights = hypernetwork(task_descriptions)

        if verbose:
            # debug: print out each adapter’s provided shape
            total_provided = 0
            print("LoRA weights from hypernetwork:")
            for name, w in lora_weights.items():
                print(f"  • {name:40s} provided → {tuple(w.shape)}  (numel={w.numel()})")
                total_provided += w.numel()
            print(f"  → total provided params: {total_provided}\n")

        #apply LoRA weights
        sam_model.apply_lora(lora_weights)

        #get and collect embeddings, squeezing out the extra dim
        image_embeddings = []
        for i in range(images.size(0)):
            # CHW float tensor -> HWC uint8 array (scaled by 255)
            img_np = images[i].permute(1, 2, 0).cpu().numpy()
            img_np = (img_np * 255).astype(np.uint8)
            emb = sam_model.get_image_embeddings(img_np)
            emb = emb.squeeze(0)
            image_embeddings.append(emb)
        image_embeddings = torch.stack(image_embeddings).to(device)

        #prepare points
        points = batch.get('points', None)
        point_labels = batch.get('point_labels', None)
        if points is None or point_labels is None:
            # No prompts in the batch: use one positive point at the image centre.
            b, c, h, w = images.shape
            points = torch.tensor([[[w//2, h//2]]]*b).float().to(device)
            point_labels = torch.ones((b,1), dtype=torch.long).to(device)
        else:
            points = points.to(device)
            point_labels = point_labels.to(device)

        #forward + loss
        # NOTE(review): backward runs on an autocast loss without a GradScaler;
        # consider adding one if fp16 gradients underflow.
        with torch.autocast(device_type='cuda', enabled=on_cuda):
            predicted_masks, _ = sam_model(
                image_embeddings=image_embeddings,
                point_coords=points,
                point_labels=point_labels,
                multimask_output=False
            )  # expected [B,1,256,256] per the original author's note

            #resize ground-truth masks to match output
            gt_masks = masks.unsqueeze(1).float()
            gt_masks = F.interpolate(
                gt_masks,
                size=predicted_masks.shape[-2:],
                mode='nearest'
            )

            loss = F.binary_cross_entropy_with_logits(predicted_masks, gt_masks)

        loss.backward()
        optimizer.step()

        batch_loss = loss.item()  # read once; each .item() forces a device sync
        total_loss += batch_loss
        pbar.set_postfix({'loss': batch_loss})

        if on_cuda and batch_idx % 10 == 0:
            torch.cuda.empty_cache()

    # max(..., 1) avoids ZeroDivisionError on an empty dataloader.
    return total_loss / max(len(dataloader), 1)


In [8]:
# train
import os
import torch
dataloader = train_dataloader

# Directory that receives one checkpoint file per epoch.
ckpt_dir = "/mnt/data/checkpoints"
os.makedirs(ckpt_dir, exist_ok=True)

def train_model(epochs, dataloader):
    """Train for ``epochs`` epochs on ``dataloader``, checkpointing after each one.

    Uses the notebook-level ``hypernet``, ``sam_model``, ``optimizer``,
    ``device`` and ``ckpt_dir``. After every epoch the mean loss is printed and
    a checkpoint holding the epoch index, both model state dicts, the optimizer
    state and the loss is written to ``ckpt_dir``.
    """
    print("Starting training…")
    for epoch in range(epochs):
        mean_loss = train_epoch(hypernet, sam_model, dataloader, optimizer, epoch, device)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {mean_loss:.4f}")

        # save every epoch
        path = f"{ckpt_dir}/checkpoint_epoch_{epoch}.pth"
        torch.save(
            dict(
                epoch=epoch,
                hypernetwork_state_dict=hypernet.state_dict(),
                sam_state_dict=sam_model.state_dict(),
                optimizer_state_dict=optimizer.state_dict(),
                loss=mean_loss,
            ),
            path,
        )
        print(f" ✔️Saved checkpoint to {path}")
    print("Training completed!")

train_model(epochs=2, dataloader=train_dataloader)

Starting training…


Epoch 1:   0%|          | 0/500 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[DEBUG] raw task_description type: <class 'list'>, value: ['segment all baseball bat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all baseball bat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_B provided → (1, 4, 32)  (numel=128)
  • output_hypernetworks_mlps.1.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.1.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.1.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_

Epoch 1:   0%|          | 1/500 [00:02<22:42,  2.73s/it, loss=0.15]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all frisbee']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all frisbee']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:   0%|          | 2/500 [00:05<21:21,  2.57s/it, loss=0.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bird in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bird in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:   1%|          | 3/500 [00:07<20:59,  2.53s/it, loss=1.98]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:   1%|          | 4/500 [00:10<20:58,  2.54s/it, loss=1.49]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all white bird in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all white bird in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:   1%|          | 5/500 [00:12<21:01,  2.55s/it, loss=0.0436]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:   1%|          | 6/500 [00:15<21:03,  2.56s/it, loss=4.1]   

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all orange broccoli in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all orange broccoli in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 1:   1%|▏         | 7/500 [00:17<20:59,  2.55s/it, loss=1.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:   2%|▏         | 8/500 [00:20<20:50,  2.54s/it, loss=0.429]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:   2%|▏         | 9/500 [00:22<20:32,  2.51s/it, loss=5.26] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:   2%|▏         | 10/500 [00:25<20:14,  2.48s/it, loss=1.68]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:   2%|▏         | 11/500 [00:27<19:59,  2.45s/it, loss=0.443]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:   2%|▏         | 12/500 [00:30<19:42,  2.42s/it, loss=0.201]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all spoon']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all spoon']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:   3%|▎         | 13/500 [00:32<19:22,  2.39s/it, loss=0.502]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:   3%|▎         | 14/500 [00:34<19:07,  2.36s/it, loss=2.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all suitcase in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all suitcase in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:   3%|▎         | 15/500 [00:36<18:51,  2.33s/it, loss=1.84]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:   3%|▎         | 16/500 [00:39<18:36,  2.31s/it, loss=0.278]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all wide dog in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all wide dog in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:   3%|▎         | 17/500 [00:41<18:22,  2.28s/it, loss=0.578]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:   4%|▎         | 18/500 [00:43<18:10,  2.26s/it, loss=1.58] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment pink sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment pink sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:   4%|▍         | 19/500 [00:45<18:03,  2.25s/it, loss=0.374]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:   4%|▍         | 20/500 [00:48<17:55,  2.24s/it, loss=3.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all pink laptop objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all pink laptop objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:   4%|▍         | 21/500 [00:50<17:50,  2.24s/it, loss=4.78]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:   4%|▍         | 22/500 [00:52<17:46,  2.23s/it, loss=1.71]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:   5%|▍         | 23/500 [00:54<17:37,  2.22s/it, loss=0.489]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all rectangular horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all rectangular horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_h

Epoch 1:   5%|▍         | 24/500 [00:56<17:31,  2.21s/it, loss=1.68] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:   5%|▌         | 25/500 [00:59<17:26,  2.20s/it, loss=1.45]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all gray elephant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all gray elephant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 1:   5%|▌         | 26/500 [01:01<17:22,  2.20s/it, loss=2.89]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment wide chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment wide chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:   5%|▌         | 27/500 [01:03<17:21,  2.20s/it, loss=1.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bird in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bird in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:   6%|▌         | 28/500 [01:05<17:17,  2.20s/it, loss=2.95]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:   6%|▌         | 29/500 [01:07<17:18,  2.21s/it, loss=1.23]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:   6%|▌         | 30/500 [01:10<17:19,  2.21s/it, loss=0.813]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:   6%|▌         | 31/500 [01:12<17:22,  2.22s/it, loss=1.07] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 1:   6%|▋         | 32/500 [01:14<17:22,  2.23s/it, loss=2.02]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all donut from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all donut from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:   7%|▋         | 33/500 [01:16<17:22,  2.23s/it, loss=0.978]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:   7%|▋         | 34/500 [01:19<17:21,  2.23s/it, loss=0.491]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:   7%|▋         | 35/500 [01:21<17:20,  2.24s/it, loss=1.43] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all horse regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all horse regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:   7%|▋         | 36/500 [01:23<17:21,  2.25s/it, loss=2.85]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment large person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment large person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:   7%|▋         | 37/500 [01:25<17:23,  2.25s/it, loss=5.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:   8%|▊         | 38/500 [01:28<17:26,  2.26s/it, loss=0.272]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:   8%|▊         | 39/500 [01:30<17:26,  2.27s/it, loss=0.897]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all tall bench from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all tall bench from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 1:   8%|▊         | 40/500 [01:32<17:27,  2.28s/it, loss=0.473]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment silver handbag']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment silver handbag']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:   8%|▊         | 41/500 [01:35<17:33,  2.30s/it, loss=0.713]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all red backpack']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all red backpack']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:   8%|▊         | 42/500 [01:37<17:35,  2.31s/it, loss=0.446]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:   9%|▊         | 43/500 [01:39<17:34,  2.31s/it, loss=1.84] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment blue sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment blue sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:   9%|▉         | 44/500 [01:41<17:32,  2.31s/it, loss=0.808]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:   9%|▉         | 45/500 [01:44<17:28,  2.30s/it, loss=0.589]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tall cat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tall cat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:   9%|▉         | 46/500 [01:46<17:26,  2.30s/it, loss=1.01] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all chair in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all chair in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:   9%|▉         | 47/500 [01:48<17:24,  2.31s/it, loss=1.54]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  10%|▉         | 48/500 [01:51<17:21,  2.31s/it, loss=4.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all giraffe objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all giraffe objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  10%|▉         | 49/500 [01:53<17:19,  2.30s/it, loss=1.42]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  10%|█         | 50/500 [01:55<17:16,  2.30s/it, loss=3.62]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all silver cow regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all silver cow regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  10%|█         | 51/500 [01:58<17:16,  2.31s/it, loss=0.952]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  10%|█         | 52/500 [02:00<17:12,  2.30s/it, loss=0.265]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  11%|█         | 53/500 [02:02<17:07,  2.30s/it, loss=0.838]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  11%|█         | 54/500 [02:04<17:04,  2.30s/it, loss=3.63] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all zebra in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all zebra in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  11%|█         | 55/500 [02:07<17:00,  2.29s/it, loss=1.86]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sink regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sink regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  11%|█         | 56/500 [02:09<16:56,  2.29s/it, loss=2.35]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all purple person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all purple person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  11%|█▏        | 57/500 [02:11<16:53,  2.29s/it, loss=0.363]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every white person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every white person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  12%|█▏        | 58/500 [02:14<16:50,  2.29s/it, loss=1.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment fire hydrant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment fire hydrant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  12%|█▏        | 59/500 [02:16<16:46,  2.28s/it, loss=0.928]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sports ball regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sports ball regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  12%|█▏        | 60/500 [02:18<16:43,  2.28s/it, loss=0.402]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bowl']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bowl']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  12%|█▏        | 61/500 [02:20<16:41,  2.28s/it, loss=0.923]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all tie from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all tie from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  12%|█▏        | 62/500 [02:23<16:37,  2.28s/it, loss=0.744]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all gray toilet in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all gray toilet in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  13%|█▎        | 63/500 [02:25<16:32,  2.27s/it, loss=3.82] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all purple bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all purple bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 1:  13%|█▎        | 64/500 [02:27<16:28,  2.27s/it, loss=1.35]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  13%|█▎        | 65/500 [02:30<16:26,  2.27s/it, loss=0.878]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  13%|█▎        | 66/500 [02:32<16:22,  2.26s/it, loss=0.348]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all motorcycle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all motorcycle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 1:  13%|█▎        | 67/500 [02:34<16:17,  2.26s/it, loss=1.64] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  14%|█▎        | 68/500 [02:36<16:13,  2.25s/it, loss=1.83]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all mouse in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all mouse in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  14%|█▍        | 69/500 [02:39<16:11,  2.25s/it, loss=0.523]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every green giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every green giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  14%|█▍        | 70/500 [02:41<16:08,  2.25s/it, loss=1.09] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  14%|█▍        | 71/500 [02:43<16:07,  2.26s/it, loss=2.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  14%|█▍        | 72/500 [02:45<16:05,  2.26s/it, loss=0.681]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all elephant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all elephant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  15%|█▍        | 73/500 [02:48<16:01,  2.25s/it, loss=1.68] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  15%|█▍        | 74/500 [02:50<15:57,  2.25s/it, loss=0.238]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment vase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment vase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  15%|█▌        | 75/500 [02:52<15:55,  2.25s/it, loss=0.528]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every gray chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every gray chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  15%|█▌        | 76/500 [02:54<15:53,  2.25s/it, loss=5.51] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  15%|█▌        | 77/500 [02:57<15:50,  2.25s/it, loss=1.3] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bottle objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bottle objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  16%|█▌        | 78/500 [02:59<15:48,  2.25s/it, loss=1.86]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all tie regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all tie regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  16%|█▌        | 79/500 [03:01<15:49,  2.26s/it, loss=0.845]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  16%|█▌        | 80/500 [03:03<15:49,  2.26s/it, loss=1.38] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all white tv in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all white tv in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  16%|█▌        | 81/500 [03:06<15:52,  2.27s/it, loss=4.38]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all traffic light in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all traffic light in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  16%|█▋        | 82/500 [03:08<15:51,  2.28s/it, loss=0.472]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every black person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every black person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  17%|█▋        | 83/500 [03:10<15:49,  2.28s/it, loss=0.604]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all dog from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all dog from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  17%|█▋        | 84/500 [03:12<15:47,  2.28s/it, loss=0.385]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  17%|█▋        | 85/500 [03:15<15:44,  2.28s/it, loss=5.76] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  17%|█▋        | 86/500 [03:17<15:43,  2.28s/it, loss=0.717]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment long cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment long cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  17%|█▋        | 87/500 [03:19<15:43,  2.28s/it, loss=1.1]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  18%|█▊        | 88/500 [03:22<15:41,  2.28s/it, loss=0.875]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all sheep from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all sheep from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  18%|█▊        | 89/500 [03:24<15:39,  2.29s/it, loss=1.98] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all handbag']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all handbag']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  18%|█▊        | 90/500 [03:26<15:39,  2.29s/it, loss=0.308]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all train from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all train from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  18%|█▊        | 91/500 [03:28<15:40,  2.30s/it, loss=1.48] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  18%|█▊        | 92/500 [03:31<15:38,  2.30s/it, loss=0.287]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all tie from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all tie from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  19%|█▊        | 93/500 [03:33<15:35,  2.30s/it, loss=4.55] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  19%|█▉        | 94/500 [03:35<15:33,  2.30s/it, loss=4.02]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  19%|█▉        | 95/500 [03:38<15:30,  2.30s/it, loss=0.68]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 1:  19%|█▉        | 96/500 [03:40<15:27,  2.30s/it, loss=4.72]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  19%|█▉        | 97/500 [03:42<15:25,  2.30s/it, loss=0.662]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  20%|█▉        | 98/500 [03:45<15:22,  2.29s/it, loss=2.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  20%|█▉        | 99/500 [03:47<15:20,  2.29s/it, loss=2.24]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every white truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every white truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  20%|██        | 100/500 [03:49<15:16,  2.29s/it, loss=2.66]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all boat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all boat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  20%|██        | 101/500 [03:51<15:14,  2.29s/it, loss=2.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all tall toilet regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all tall toilet regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  20%|██        | 102/500 [03:54<15:12,  2.29s/it, loss=0.384]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all round boat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all round boat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  21%|██        | 103/500 [03:56<15:08,  2.29s/it, loss=2.72] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every thin bottle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every thin bottle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  21%|██        | 104/500 [03:58<15:05,  2.29s/it, loss=0.318]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all laptop in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all laptop in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  21%|██        | 105/500 [04:01<15:00,  2.28s/it, loss=3.4]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all orange potted plant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all orange potted plant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 1:  21%|██        | 106/500 [04:03<14:56,  2.27s/it, loss=4.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  21%|██▏       | 107/500 [04:05<14:52,  2.27s/it, loss=4.84]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  22%|██▏       | 108/500 [04:07<14:47,  2.26s/it, loss=5.26]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all black zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all black zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  22%|██▏       | 109/500 [04:10<14:44,  2.26s/it, loss=0.744]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every thin car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every thin car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  22%|██▏       | 110/500 [04:12<14:43,  2.27s/it, loss=1.72] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  22%|██▏       | 111/500 [04:14<14:44,  2.27s/it, loss=0.447]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bottle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bottle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  22%|██▏       | 112/500 [04:16<14:43,  2.28s/it, loss=0.623]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  23%|██▎       | 113/500 [04:19<14:38,  2.27s/it, loss=1.43] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all round car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all round car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  23%|██▎       | 114/500 [04:21<14:34,  2.27s/it, loss=3.59]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all dining table regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all dining table regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  23%|██▎       | 115/500 [04:23<14:32,  2.27s/it, loss=1.42]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all remote objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all remote objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  23%|██▎       | 116/500 [04:25<14:29,  2.27s/it, loss=1.16]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all blue couch regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all blue couch regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  23%|██▎       | 117/500 [04:28<14:27,  2.26s/it, loss=1.68]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  24%|██▎       | 118/500 [04:30<14:24,  2.26s/it, loss=1.02]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  24%|██▍       | 119/500 [04:32<14:22,  2.26s/it, loss=2.9] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all dog in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all dog in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  24%|██▍       | 120/500 [04:35<14:19,  2.26s/it, loss=3.17]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all cake objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all cake objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  24%|██▍       | 121/500 [04:37<14:20,  2.27s/it, loss=3.29]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  24%|██▍       | 122/500 [04:39<14:17,  2.27s/it, loss=1.76]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all boat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all boat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  25%|██▍       | 123/500 [04:41<14:13,  2.26s/it, loss=3.2] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  25%|██▍       | 124/500 [04:44<14:09,  2.26s/it, loss=0.328]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment fire hydrant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment fire hydrant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  25%|██▌       | 125/500 [04:46<14:07,  2.26s/it, loss=0.154]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all large elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all large elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  25%|██▌       | 126/500 [04:48<14:04,  2.26s/it, loss=0.414]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all square sheep in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all square sheep in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  25%|██▌       | 127/500 [04:50<14:02,  2.26s/it, loss=0.668]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment silver horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment silver horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  26%|██▌       | 128/500 [04:53<13:59,  2.26s/it, loss=0.458]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment long cup']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment long cup']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  26%|██▌       | 129/500 [04:55<13:58,  2.26s/it, loss=0.228]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  26%|██▌       | 130/500 [04:57<13:56,  2.26s/it, loss=3.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all handbag from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all handbag from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  26%|██▌       | 131/500 [04:59<13:57,  2.27s/it, loss=0.198]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  26%|██▋       | 132/500 [05:02<13:54,  2.27s/it, loss=0.479]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bench']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bench']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  27%|██▋       | 133/500 [05:04<13:50,  2.26s/it, loss=0.56] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  27%|██▋       | 134/500 [05:06<13:47,  2.26s/it, loss=1.15]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all orange person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all orange person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  27%|██▋       | 135/500 [05:08<13:44,  2.26s/it, loss=0.915]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  27%|██▋       | 136/500 [05:11<13:40,  2.25s/it, loss=0.988]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment gray giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment gray giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  27%|██▋       | 137/500 [05:13<13:38,  2.26s/it, loss=1.37] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  28%|██▊       | 138/500 [05:15<13:37,  2.26s/it, loss=1.49]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all cup']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all cup']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 1:  28%|██▊       | 139/500 [05:17<13:34,  2.26s/it, loss=0.591]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bowl regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bowl regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  28%|██▊       | 140/500 [05:20<13:31,  2.26s/it, loss=0.971]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  28%|██▊       | 141/500 [05:22<13:33,  2.27s/it, loss=0.94] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cow in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cow in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  28%|██▊       | 142/500 [05:24<13:28,  2.26s/it, loss=3.26]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all triangular person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all triangular person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  29%|██▊       | 143/500 [05:27<13:27,  2.26s/it, loss=3.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  29%|██▉       | 144/500 [05:29<13:25,  2.26s/it, loss=3.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment brown bottle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment brown bottle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  29%|██▉       | 145/500 [05:31<13:23,  2.26s/it, loss=0.221]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every silver bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every silver bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  29%|██▉       | 146/500 [05:33<13:21,  2.26s/it, loss=0.772]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bottle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bottle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  29%|██▉       | 147/500 [05:36<13:18,  2.26s/it, loss=0.169]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sink regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sink regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  30%|██▉       | 148/500 [05:38<13:16,  2.26s/it, loss=1.38] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  30%|██▉       | 149/500 [05:40<13:13,  2.26s/it, loss=1.26]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  30%|███       | 150/500 [05:42<13:12,  2.26s/it, loss=2.54]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  30%|███       | 151/500 [05:45<13:13,  2.27s/it, loss=0.258]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment wide person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment wide person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  30%|███       | 152/500 [05:47<13:11,  2.27s/it, loss=0.229]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all brown truck from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all brown truck from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 1:  31%|███       | 153/500 [05:49<13:09,  2.27s/it, loss=0.136]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bottle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bottle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  31%|███       | 154/500 [05:51<13:06,  2.27s/it, loss=0.231]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all long bus regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all long bus regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  31%|███       | 155/500 [05:54<13:03,  2.27s/it, loss=0.279]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all dog objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all dog objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  31%|███       | 156/500 [05:56<13:00,  2.27s/it, loss=0.149]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  31%|███▏      | 157/500 [05:58<12:59,  2.27s/it, loss=2.65] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  32%|███▏      | 158/500 [06:01<12:57,  2.27s/it, loss=1.49]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all elephant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all elephant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  32%|███▏      | 159/500 [06:03<12:56,  2.28s/it, loss=1.53]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment black elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment black elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  32%|███▏      | 160/500 [06:05<12:54,  2.28s/it, loss=2.22]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bird objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bird objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  32%|███▏      | 161/500 [06:07<12:54,  2.28s/it, loss=2.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  32%|███▏      | 162/500 [06:10<12:52,  2.29s/it, loss=2.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all elephant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all elephant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  33%|███▎      | 163/500 [06:12<12:50,  2.29s/it, loss=2.57]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  33%|███▎      | 164/500 [06:14<12:47,  2.28s/it, loss=0.986]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  33%|███▎      | 165/500 [06:17<12:44,  2.28s/it, loss=5.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment microwave']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment microwave']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  33%|███▎      | 166/500 [06:19<12:41,  2.28s/it, loss=1.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment brown elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment brown elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  33%|███▎      | 167/500 [06:21<12:38,  2.28s/it, loss=0.804]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  34%|███▎      | 168/500 [06:23<12:36,  2.28s/it, loss=2]    

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment gray bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment gray bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  34%|███▍      | 169/500 [06:26<12:33,  2.28s/it, loss=2.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment small bench']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment small bench']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  34%|███▍      | 170/500 [06:28<12:31,  2.28s/it, loss=0.544]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all sheep objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all sheep objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  34%|███▍      | 171/500 [06:30<12:32,  2.29s/it, loss=0.225]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all sheep in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all sheep in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  34%|███▍      | 172/500 [06:33<12:30,  2.29s/it, loss=0.113]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all boat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all boat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  35%|███▍      | 173/500 [06:35<12:28,  2.29s/it, loss=4.56] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  35%|███▍      | 174/500 [06:37<12:27,  2.29s/it, loss=0.885]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  35%|███▌      | 175/500 [06:39<12:23,  2.29s/it, loss=4.68] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  35%|███▌      | 176/500 [06:42<12:21,  2.29s/it, loss=2.91]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  35%|███▌      | 177/500 [06:44<12:18,  2.29s/it, loss=2.79]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all orange from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all orange from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  36%|███▌      | 178/500 [06:46<12:16,  2.29s/it, loss=1.37]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all banana objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all banana objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  36%|███▌      | 179/500 [06:49<12:14,  2.29s/it, loss=1.93]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tall tv in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tall tv in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  36%|███▌      | 180/500 [06:51<12:11,  2.29s/it, loss=1.96]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all elephant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all elephant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  36%|███▌      | 181/500 [06:53<12:11,  2.29s/it, loss=2.04]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  36%|███▋      | 182/500 [06:55<12:09,  2.29s/it, loss=0.696]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment vase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment vase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  37%|███▋      | 183/500 [06:58<12:06,  2.29s/it, loss=0.628]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bear in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bear in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  37%|███▋      | 184/500 [07:00<12:03,  2.29s/it, loss=1.69] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment tall stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment tall stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  37%|███▋      | 185/500 [07:02<11:59,  2.29s/it, loss=0.447]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bowl']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bowl']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  37%|███▋      | 186/500 [07:05<11:57,  2.29s/it, loss=0.169]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  37%|███▋      | 187/500 [07:07<11:55,  2.29s/it, loss=2.09] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  38%|███▊      | 188/500 [07:09<11:52,  2.29s/it, loss=0.853]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  38%|███▊      | 189/500 [07:11<11:51,  2.29s/it, loss=1.63] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all red tie']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all red tie']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  38%|███▊      | 190/500 [07:14<11:49,  2.29s/it, loss=8.72]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment triangular person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment triangular person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 1:  38%|███▊      | 191/500 [07:16<11:49,  2.29s/it, loss=0.789]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tall motorcycle objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tall motorcycle objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  38%|███▊      | 192/500 [07:18<11:45,  2.29s/it, loss=2.61] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  39%|███▊      | 193/500 [07:21<11:42,  2.29s/it, loss=0.706]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  39%|███▉      | 194/500 [07:23<11:38,  2.28s/it, loss=0.395]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all sink from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all sink from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  39%|███▉      | 195/500 [07:25<11:36,  2.28s/it, loss=0.922]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  39%|███▉      | 196/500 [07:27<11:32,  2.28s/it, loss=1.85] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every tall bottle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every tall bottle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  39%|███▉      | 197/500 [07:30<11:28,  2.27s/it, loss=0.43]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment parking meter']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment parking meter']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  40%|███▉      | 198/500 [07:32<11:26,  2.27s/it, loss=1.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment dining table']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment dining table']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  40%|███▉      | 199/500 [07:34<11:23,  2.27s/it, loss=2.41]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all wine glass regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all wine glass regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  40%|████      | 200/500 [07:37<11:22,  2.27s/it, loss=0.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all suitcase regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all suitcase regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  40%|████      | 201/500 [07:39<11:21,  2.28s/it, loss=1.32]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all sink from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all sink from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  40%|████      | 202/500 [07:41<11:19,  2.28s/it, loss=0.915]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  41%|████      | 203/500 [07:43<11:16,  2.28s/it, loss=1.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  41%|████      | 204/500 [07:46<11:12,  2.27s/it, loss=3.81]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment blue person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment blue person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  41%|████      | 205/500 [07:48<11:09,  2.27s/it, loss=0.171]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  41%|████      | 206/500 [07:50<11:06,  2.27s/it, loss=2.76] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all suitcase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all suitcase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  41%|████▏     | 207/500 [07:52<11:04,  2.27s/it, loss=2.76]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all oval sheep in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all oval sheep in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  42%|████▏     | 208/500 [07:55<11:02,  2.27s/it, loss=1.48]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment black person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment black person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  42%|████▏     | 209/500 [07:57<11:00,  2.27s/it, loss=2.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all large clock objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all large clock objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  42%|████▏     | 210/500 [07:59<10:58,  2.27s/it, loss=1.09]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  42%|████▏     | 211/500 [08:02<10:57,  2.28s/it, loss=1.1] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all blue toilet objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all blue toilet objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  42%|████▏     | 212/500 [08:04<10:56,  2.28s/it, loss=0.38]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  43%|████▎     | 213/500 [08:06<10:53,  2.28s/it, loss=1]   

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every thin zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every thin zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  43%|████▎     | 214/500 [08:08<10:50,  2.27s/it, loss=0.453]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all long person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all long person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  43%|████▎     | 215/500 [08:11<10:48,  2.28s/it, loss=2.14] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  43%|████▎     | 216/500 [08:13<10:45,  2.27s/it, loss=3.95]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all backpack in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all backpack in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  43%|████▎     | 217/500 [08:15<10:43,  2.27s/it, loss=0.994]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  44%|████▎     | 218/500 [08:17<10:40,  2.27s/it, loss=2.21] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all stop sign in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all stop sign in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  44%|████▍     | 219/500 [08:20<10:38,  2.27s/it, loss=0.729]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bicycle objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bicycle objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  44%|████▍     | 220/500 [08:22<10:36,  2.27s/it, loss=1.59] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all pink sheep regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all pink sheep regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  44%|████▍     | 221/500 [08:24<10:35,  2.28s/it, loss=1.24]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all handbag from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all handbag from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  44%|████▍     | 222/500 [08:27<10:34,  2.28s/it, loss=0.631]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bus']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bus']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  45%|████▍     | 223/500 [08:29<10:32,  2.28s/it, loss=5.12] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  45%|████▍     | 224/500 [08:31<10:29,  2.28s/it, loss=0.349]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment parking meter']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment parking meter']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  45%|████▌     | 225/500 [08:33<10:27,  2.28s/it, loss=0.954]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all round chair regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all round chair regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  45%|████▌     | 226/500 [08:36<10:24,  2.28s/it, loss=0.134]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all fire hydrant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all fire hydrant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  45%|████▌     | 227/500 [08:38<10:22,  2.28s/it, loss=1.25] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all truck from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all truck from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  46%|████▌     | 228/500 [08:40<10:20,  2.28s/it, loss=0.694]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bottle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bottle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  46%|████▌     | 229/500 [08:43<10:18,  2.28s/it, loss=0.271]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  46%|████▌     | 230/500 [08:45<10:15,  2.28s/it, loss=0.6]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all thin zebra objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all thin zebra objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  46%|████▌     | 231/500 [08:47<10:15,  2.29s/it, loss=2.41]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all rectangular airplane objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all rectangular airplane objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  46%|████▋     | 232/500 [08:49<10:14,  2.29s/it, loss=2.44]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  47%|████▋     | 233/500 [08:52<10:10,  2.29s/it, loss=2.98]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  47%|████▋     | 234/500 [08:54<10:07,  2.29s/it, loss=0.134]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all refrigerator regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all refrigerator regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  47%|████▋     | 235/500 [08:56<10:05,  2.28s/it, loss=0.768]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment silver horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment silver horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  47%|████▋     | 236/500 [08:59<10:03,  2.29s/it, loss=1.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  47%|████▋     | 237/500 [09:01<10:00,  2.29s/it, loss=0.0714]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment blue keyboard']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment blue keyboard']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  48%|████▊     | 238/500 [09:03<09:57,  2.28s/it, loss=5.03]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  48%|████▊     | 239/500 [09:05<09:54,  2.28s/it, loss=0.238]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  48%|████▊     | 240/500 [09:08<09:52,  2.28s/it, loss=4.46] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  48%|████▊     | 241/500 [09:10<09:52,  2.29s/it, loss=2.59]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all small bus']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all small bus']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 1:  48%|████▊     | 242/500 [09:12<09:50,  2.29s/it, loss=1.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment couch']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment couch']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  49%|████▊     | 243/500 [09:15<09:47,  2.29s/it, loss=1.69]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  49%|████▉     | 244/500 [09:17<09:45,  2.29s/it, loss=3.49]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  49%|████▉     | 245/500 [09:19<09:42,  2.29s/it, loss=1.34]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all handbag']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all handbag']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  49%|████▉     | 246/500 [09:21<09:40,  2.29s/it, loss=0.354]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every fire hydrant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every fire hydrant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  49%|████▉     | 247/500 [09:24<09:38,  2.29s/it, loss=2.34] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  50%|████▉     | 248/500 [09:26<09:35,  2.28s/it, loss=3.84]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  50%|████▉     | 249/500 [09:28<09:33,  2.29s/it, loss=3.9] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every wide bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every wide bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  50%|█████     | 250/500 [09:31<09:31,  2.29s/it, loss=0.844]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every circular umbrella']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every circular umbrella']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  50%|█████     | 251/500 [09:33<09:31,  2.29s/it, loss=0.342]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all umbrella from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all umbrella from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  50%|█████     | 252/500 [09:35<09:28,  2.29s/it, loss=2.4]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  51%|█████     | 253/500 [09:37<09:25,  2.29s/it, loss=1.04]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  51%|█████     | 254/500 [09:40<09:23,  2.29s/it, loss=2.83]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  51%|█████     | 255/500 [09:42<09:21,  2.29s/it, loss=2.11]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all giraffe objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all giraffe objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  51%|█████     | 256/500 [09:44<09:18,  2.29s/it, loss=1.65]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every white bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every white bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  51%|█████▏    | 257/500 [09:47<09:15,  2.29s/it, loss=2.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  52%|█████▏    | 258/500 [09:49<09:14,  2.29s/it, loss=0.195]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  52%|█████▏    | 259/500 [09:51<09:11,  2.29s/it, loss=0.815]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 1:  52%|█████▏    | 260/500 [09:53<09:09,  2.29s/it, loss=0.688]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment motorcycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment motorcycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  52%|█████▏    | 261/500 [09:56<09:09,  2.30s/it, loss=2.03] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all square bench from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all square bench from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  52%|█████▏    | 262/500 [09:58<09:06,  2.30s/it, loss=0.919]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all gray cow objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all gray cow objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  53%|█████▎    | 263/500 [10:00<09:03,  2.29s/it, loss=0.921]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment round person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment round person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  53%|█████▎    | 264/500 [10:03<09:00,  2.29s/it, loss=4.53] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  53%|█████▎    | 265/500 [10:05<08:57,  2.29s/it, loss=1.5] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all purple bowl']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all purple bowl']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  53%|█████▎    | 266/500 [10:07<08:54,  2.28s/it, loss=0.233]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all boat objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all boat objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  53%|█████▎    | 267/500 [10:09<08:51,  2.28s/it, loss=2.64] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all vase in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all vase in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  54%|█████▎    | 268/500 [10:12<08:49,  2.28s/it, loss=0.116]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment remote']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment remote']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  54%|█████▍    | 269/500 [10:14<08:47,  2.29s/it, loss=0.7]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 1:  54%|█████▍    | 270/500 [10:16<08:45,  2.28s/it, loss=0.292]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  54%|█████▍    | 271/500 [10:19<08:43,  2.29s/it, loss=2.65] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bench regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bench regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  54%|█████▍    | 272/500 [10:21<08:41,  2.29s/it, loss=0.247]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all white person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all white person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  55%|█████▍    | 273/500 [10:23<08:38,  2.28s/it, loss=4.11] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  55%|█████▍    | 274/500 [10:25<08:35,  2.28s/it, loss=1.72]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all orange clock from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all orange clock from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  55%|█████▌    | 275/500 [10:28<08:32,  2.28s/it, loss=3.65]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all dining table from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all dining table from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  55%|█████▌    | 276/500 [10:30<08:29,  2.27s/it, loss=1.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  55%|█████▌    | 277/500 [10:32<08:26,  2.27s/it, loss=1.03]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  56%|█████▌    | 278/500 [10:35<08:24,  2.27s/it, loss=1.74]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment gray cup']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment gray cup']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  56%|█████▌    | 279/500 [10:37<08:22,  2.27s/it, loss=0.312]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all triangular zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all triangular zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  56%|█████▌    | 280/500 [10:39<08:20,  2.27s/it, loss=2.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment black airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment black airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  56%|█████▌    | 281/500 [10:41<08:18,  2.28s/it, loss=3.77]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  56%|█████▋    | 282/500 [10:44<08:16,  2.28s/it, loss=1.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  57%|█████▋    | 283/500 [10:46<08:13,  2.27s/it, loss=2.29]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all truck objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all truck objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  57%|█████▋    | 284/500 [10:48<08:11,  2.27s/it, loss=0.82]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every frisbee']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every frisbee']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 1:  57%|█████▋    | 285/500 [10:50<08:08,  2.27s/it, loss=0.746]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  57%|█████▋    | 286/500 [10:53<08:05,  2.27s/it, loss=0.582]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  57%|█████▋    | 287/500 [10:55<08:03,  2.27s/it, loss=4.14] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all round cat objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all round cat objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  58%|█████▊    | 288/500 [10:57<08:01,  2.27s/it, loss=0.606]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every tall cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every tall cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  58%|█████▊    | 289/500 [11:00<07:59,  2.27s/it, loss=3.24] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 1:  58%|█████▊    | 290/500 [11:02<07:56,  2.27s/it, loss=1.43]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  58%|█████▊    | 291/500 [11:04<07:55,  2.28s/it, loss=2.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  58%|█████▊    | 292/500 [11:06<07:53,  2.28s/it, loss=2.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  59%|█████▊    | 293/500 [11:09<07:51,  2.28s/it, loss=0.89]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  59%|█████▉    | 294/500 [11:11<07:48,  2.27s/it, loss=2.23]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  59%|█████▉    | 295/500 [11:13<07:45,  2.27s/it, loss=0.725]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all giraffe regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all giraffe regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  59%|█████▉    | 296/500 [11:15<07:43,  2.27s/it, loss=2.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 1:  59%|█████▉    | 297/500 [11:18<07:41,  2.27s/it, loss=1.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all thin boat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all thin boat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  60%|█████▉    | 298/500 [11:20<07:38,  2.27s/it, loss=1.03]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all keyboard regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all keyboard regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  60%|█████▉    | 299/500 [11:22<07:36,  2.27s/it, loss=0.684]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  60%|██████    | 300/500 [11:25<07:34,  2.27s/it, loss=2.9]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all gray book objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all gray book objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  60%|██████    | 301/500 [11:27<07:34,  2.28s/it, loss=0.533]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tie in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tie in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  60%|██████    | 302/500 [11:29<07:32,  2.28s/it, loss=0.752]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all large sink objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all large sink objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  61%|██████    | 303/500 [11:31<07:29,  2.28s/it, loss=1.47] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  61%|██████    | 304/500 [11:34<07:27,  2.28s/it, loss=0.101]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  61%|██████    | 305/500 [11:36<07:24,  2.28s/it, loss=2.56] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all horse in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all horse in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  61%|██████    | 306/500 [11:38<07:22,  2.28s/it, loss=3.03]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  61%|██████▏   | 307/500 [11:41<07:19,  2.28s/it, loss=1.53]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment vase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment vase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  62%|██████▏   | 308/500 [11:43<07:17,  2.28s/it, loss=0.388]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all small refrigerator']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all small refrigerator']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  62%|██████▏   | 309/500 [11:45<07:15,  2.28s/it, loss=1.47] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  62%|██████▏   | 310/500 [11:47<07:13,  2.28s/it, loss=1.37]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  62%|██████▏   | 311/500 [11:50<07:11,  2.28s/it, loss=3.81]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  62%|██████▏   | 312/500 [11:52<07:09,  2.28s/it, loss=0.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  63%|██████▎   | 313/500 [11:54<07:07,  2.28s/it, loss=2.71]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  63%|██████▎   | 314/500 [11:57<07:04,  2.28s/it, loss=0.394]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all blue sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all blue sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  63%|██████▎   | 315/500 [11:59<07:02,  2.29s/it, loss=1.21] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all zebra in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all zebra in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  63%|██████▎   | 316/500 [12:01<06:59,  2.28s/it, loss=1.55]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  63%|██████▎   | 317/500 [12:03<06:58,  2.28s/it, loss=0.17]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all black giraffe from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all black giraffe from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 1:  64%|██████▎   | 318/500 [12:06<06:55,  2.28s/it, loss=1.57]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all motorcycle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all motorcycle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 1:  64%|██████▍   | 319/500 [12:08<06:52,  2.28s/it, loss=0.447]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  64%|██████▍   | 320/500 [12:10<06:50,  2.28s/it, loss=0.682]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  64%|██████▍   | 321/500 [12:12<06:49,  2.29s/it, loss=2.37] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  64%|██████▍   | 322/500 [12:15<06:46,  2.29s/it, loss=0.601]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bird objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bird objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  65%|██████▍   | 323/500 [12:17<06:43,  2.28s/it, loss=1.7]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all round dog from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all round dog from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 1:  65%|██████▍   | 324/500 [12:19<06:41,  2.28s/it, loss=2.02]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment banana']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment banana']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  65%|██████▌   | 325/500 [12:22<06:38,  2.28s/it, loss=4.06]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all motorcycle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all motorcycle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 1:  65%|██████▌   | 326/500 [12:24<06:36,  2.28s/it, loss=1.63]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  65%|██████▌   | 327/500 [12:26<06:33,  2.27s/it, loss=1.24]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all traffic light from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all traffic light from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 1:  66%|██████▌   | 328/500 [12:28<06:30,  2.27s/it, loss=0.705]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bird objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bird objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  66%|██████▌   | 329/500 [12:31<06:27,  2.27s/it, loss=3.57] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 1:  66%|██████▌   | 330/500 [12:33<06:24,  2.26s/it, loss=2.2] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all backpack in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all backpack in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  66%|██████▌   | 331/500 [12:35<06:23,  2.27s/it, loss=0.206]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all zebra from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all zebra from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  66%|██████▋   | 332/500 [12:37<06:20,  2.27s/it, loss=0.962]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all long person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all long person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  67%|██████▋   | 333/500 [12:40<06:18,  2.26s/it, loss=0.351]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all rectangular toilet from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all rectangular toilet from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output

Epoch 1:  67%|██████▋   | 334/500 [12:42<06:15,  2.26s/it, loss=1.44] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  67%|██████▋   | 335/500 [12:44<06:13,  2.26s/it, loss=0.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  67%|██████▋   | 336/500 [12:47<06:11,  2.27s/it, loss=0.899]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  67%|██████▋   | 337/500 [12:49<06:08,  2.26s/it, loss=5.91] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all thin zebra objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all thin zebra objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  68%|██████▊   | 338/500 [12:51<06:06,  2.27s/it, loss=5.97]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  68%|██████▊   | 339/500 [12:53<06:05,  2.27s/it, loss=0.48]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all long elephant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all long elephant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  68%|██████▊   | 340/500 [12:56<06:02,  2.27s/it, loss=4.67]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment brown person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment brown person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  68%|██████▊   | 341/500 [12:58<06:01,  2.27s/it, loss=0.25]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment brown dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment brown dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  68%|██████▊   | 342/500 [13:00<05:59,  2.28s/it, loss=2.52]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  69%|██████▊   | 343/500 [13:02<05:57,  2.28s/it, loss=0.599]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 1:  69%|██████▉   | 344/500 [13:05<05:54,  2.27s/it, loss=2.11] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all toilet from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all toilet from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  69%|██████▉   | 345/500 [13:07<05:51,  2.27s/it, loss=0.149]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all long person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all long person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  69%|██████▉   | 346/500 [13:09<05:49,  2.27s/it, loss=5]    

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  69%|██████▉   | 347/500 [13:11<05:46,  2.27s/it, loss=0.513]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all traffic light objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all traffic light objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  70%|██████▉   | 348/500 [13:14<05:44,  2.27s/it, loss=2.83] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment circular horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment circular horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  70%|██████▉   | 349/500 [13:16<05:42,  2.27s/it, loss=4.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  70%|███████   | 350/500 [13:18<05:40,  2.27s/it, loss=0.481]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all gray traffic light from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all gray traffic light from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output

Epoch 1:  70%|███████   | 351/500 [13:21<05:39,  2.28s/it, loss=3.81] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all elephant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all elephant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  70%|███████   | 352/500 [13:23<05:36,  2.27s/it, loss=2.24]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all pink bicycle regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all pink bicycle regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  71%|███████   | 353/500 [13:25<05:34,  2.28s/it, loss=0.875]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  71%|███████   | 354/500 [13:27<05:31,  2.27s/it, loss=2.15] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  71%|███████   | 355/500 [13:30<05:28,  2.27s/it, loss=1.27]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  71%|███████   | 356/500 [13:32<05:26,  2.27s/it, loss=1.3] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every silver zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every silver zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  71%|███████▏  | 357/500 [13:34<05:24,  2.27s/it, loss=2.65]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  72%|███████▏  | 358/500 [13:36<05:22,  2.27s/it, loss=0.905]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  72%|███████▏  | 359/500 [13:39<05:19,  2.27s/it, loss=0.175]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  72%|███████▏  | 360/500 [13:41<05:17,  2.27s/it, loss=1.2]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  72%|███████▏  | 361/500 [13:43<05:16,  2.28s/it, loss=0.147]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all giraffe from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all giraffe from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  72%|███████▏  | 362/500 [13:46<05:14,  2.28s/it, loss=4.99] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all long chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all long chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  73%|███████▎  | 363/500 [13:48<05:11,  2.27s/it, loss=0.807]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all elephant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all elephant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  73%|███████▎  | 364/500 [13:50<05:08,  2.27s/it, loss=2.97] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  73%|███████▎  | 365/500 [13:52<05:06,  2.27s/it, loss=0.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  73%|███████▎  | 366/500 [13:55<05:04,  2.27s/it, loss=0.196]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all horse in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all horse in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  73%|███████▎  | 367/500 [13:57<05:01,  2.27s/it, loss=3.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  74%|███████▎  | 368/500 [13:59<04:59,  2.27s/it, loss=1.7] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all blue potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all blue potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  74%|███████▍  | 369/500 [14:01<04:56,  2.27s/it, loss=1.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  74%|███████▍  | 370/500 [14:04<04:54,  2.27s/it, loss=1.3] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  74%|███████▍  | 371/500 [14:06<04:53,  2.27s/it, loss=1.87]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all traffic light from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all traffic light from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 1:  74%|███████▍  | 372/500 [14:08<04:51,  2.28s/it, loss=0.951]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every orange car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every orange car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  75%|███████▍  | 373/500 [14:11<04:48,  2.27s/it, loss=0.228]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all boat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all boat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  75%|███████▍  | 374/500 [14:13<04:46,  2.27s/it, loss=0.612]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  75%|███████▌  | 375/500 [14:15<04:44,  2.27s/it, loss=0.398]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  75%|███████▌  | 376/500 [14:17<04:41,  2.27s/it, loss=2.75] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all white zebra from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all white zebra from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 1:  75%|███████▌  | 377/500 [14:20<04:39,  2.27s/it, loss=2.34]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  76%|███████▌  | 378/500 [14:22<04:37,  2.27s/it, loss=0.589]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all sink objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all sink objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  76%|███████▌  | 379/500 [14:24<04:35,  2.27s/it, loss=0.948]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every wide banana']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every wide banana']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  76%|███████▌  | 380/500 [14:26<04:32,  2.27s/it, loss=1.32] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  76%|███████▌  | 381/500 [14:29<04:31,  2.28s/it, loss=1.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all purple horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all purple horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  76%|███████▋  | 382/500 [14:31<04:28,  2.28s/it, loss=3.6] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all triangular toilet objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all triangular toilet objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 1:  77%|███████▋  | 383/500 [14:33<04:26,  2.28s/it, loss=0.6]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cow from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cow from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  77%|███████▋  | 384/500 [14:36<04:23,  2.27s/it, loss=2.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  77%|███████▋  | 385/500 [14:38<04:21,  2.28s/it, loss=0.961]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all gray cow in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all gray cow in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  77%|███████▋  | 386/500 [14:40<04:19,  2.28s/it, loss=3.38] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all toothbrush regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all toothbrush regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  77%|███████▋  | 387/500 [14:42<04:17,  2.28s/it, loss=0.117]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  78%|███████▊  | 388/500 [14:45<04:15,  2.28s/it, loss=1.36] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all white horse regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all white horse regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  78%|███████▊  | 389/500 [14:47<04:12,  2.28s/it, loss=6.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cow in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cow in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  78%|███████▊  | 390/500 [14:49<04:10,  2.28s/it, loss=3.03]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all blue toilet objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all blue toilet objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  78%|███████▊  | 391/500 [14:52<04:09,  2.29s/it, loss=1.96]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bottle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bottle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  78%|███████▊  | 392/500 [14:54<04:07,  2.29s/it, loss=0.186]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every suitcase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every suitcase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  79%|███████▊  | 393/500 [14:56<04:05,  2.29s/it, loss=1.78] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  79%|███████▉  | 394/500 [14:58<04:02,  2.29s/it, loss=0.291]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  79%|███████▉  | 395/500 [15:01<04:00,  2.29s/it, loss=3.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  79%|███████▉  | 396/500 [15:03<03:57,  2.29s/it, loss=1.18]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all suitcase regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all suitcase regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  79%|███████▉  | 397/500 [15:05<03:55,  2.28s/it, loss=1.88]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment square cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment square cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  80%|███████▉  | 398/500 [15:08<03:52,  2.28s/it, loss=1.74]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  80%|███████▉  | 399/500 [15:10<03:50,  2.28s/it, loss=0.692]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  80%|████████  | 400/500 [15:12<03:48,  2.28s/it, loss=8.93] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all red airplane regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all red airplane regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  80%|████████  | 401/500 [15:14<03:46,  2.29s/it, loss=3.26]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every brown train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every brown train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  80%|████████  | 402/500 [15:17<03:44,  2.29s/it, loss=4.42]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  81%|████████  | 403/500 [15:19<03:41,  2.29s/it, loss=3.69]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all airplane from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all airplane from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  81%|████████  | 404/500 [15:21<03:39,  2.28s/it, loss=2.62]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bus']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bus']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  81%|████████  | 405/500 [15:24<03:37,  2.29s/it, loss=0.351]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  81%|████████  | 406/500 [15:26<03:34,  2.29s/it, loss=6.1]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  81%|████████▏ | 407/500 [15:28<03:32,  2.29s/it, loss=1.28]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  82%|████████▏ | 408/500 [15:30<03:30,  2.29s/it, loss=3.11]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all book objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all book objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  82%|████████▏ | 409/500 [15:33<03:28,  2.29s/it, loss=0.593]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all silver bench']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all silver bench']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  82%|████████▏ | 410/500 [15:35<03:26,  2.29s/it, loss=1.02] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every umbrella']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every umbrella']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  82%|████████▏ | 411/500 [15:37<03:24,  2.30s/it, loss=0.201]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  82%|████████▏ | 412/500 [15:40<03:22,  2.30s/it, loss=0.737]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  83%|████████▎ | 413/500 [15:42<03:19,  2.30s/it, loss=2.96] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  83%|████████▎ | 414/500 [15:44<03:17,  2.29s/it, loss=2.48]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all cat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all cat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  83%|████████▎ | 415/500 [15:46<03:14,  2.29s/it, loss=1.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all cow objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all cow objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  83%|████████▎ | 416/500 [15:49<03:11,  2.28s/it, loss=1.89]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  83%|████████▎ | 417/500 [15:51<03:09,  2.28s/it, loss=2.75]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  84%|████████▎ | 418/500 [15:53<03:06,  2.28s/it, loss=0.411]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all traffic light objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all traffic light objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  84%|████████▍ | 419/500 [15:56<03:04,  2.28s/it, loss=2.02] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment round dining table']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment round dining table']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  84%|████████▍ | 420/500 [15:58<03:02,  2.28s/it, loss=1.71]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all long umbrella']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all long umbrella']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  84%|████████▍ | 421/500 [16:00<03:00,  2.29s/it, loss=1.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  84%|████████▍ | 422/500 [16:02<02:58,  2.29s/it, loss=3.87]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all round truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all round truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  85%|████████▍ | 423/500 [16:05<02:56,  2.29s/it, loss=0.684]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all circular person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all circular person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hyper

Epoch 1:  85%|████████▍ | 424/500 [16:07<02:53,  2.29s/it, loss=1.7]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  85%|████████▌ | 425/500 [16:09<02:51,  2.29s/it, loss=5.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all surfboard objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all surfboard objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  85%|████████▌ | 426/500 [16:12<02:49,  2.29s/it, loss=0.555]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  85%|████████▌ | 427/500 [16:14<02:46,  2.28s/it, loss=2.25] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  86%|████████▌ | 428/500 [16:16<02:44,  2.28s/it, loss=2.92]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sink regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sink regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  86%|████████▌ | 429/500 [16:18<02:41,  2.28s/it, loss=1.96]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  86%|████████▌ | 430/500 [16:21<02:39,  2.28s/it, loss=0.504]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bicycle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bicycle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  86%|████████▌ | 431/500 [16:23<02:37,  2.29s/it, loss=1.14] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all potted plant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all potted plant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 1:  86%|████████▋ | 432/500 [16:25<02:35,  2.29s/it, loss=2.58]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all motorcycle regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all motorcycle regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  87%|████████▋ | 433/500 [16:28<02:32,  2.28s/it, loss=2.88]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  87%|████████▋ | 434/500 [16:30<02:30,  2.28s/it, loss=1.32]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bottle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bottle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  87%|████████▋ | 435/500 [16:32<02:28,  2.28s/it, loss=3.9] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 1:  87%|████████▋ | 436/500 [16:34<02:25,  2.28s/it, loss=2.97]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all teddy bear regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all teddy bear regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  87%|████████▋ | 437/500 [16:37<02:23,  2.28s/it, loss=0.174]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment round bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment round bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 1:  88%|████████▊ | 438/500 [16:39<02:21,  2.28s/it, loss=1.28] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all traffic light in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all traffic light in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 1:  88%|████████▊ | 439/500 [16:41<02:18,  2.28s/it, loss=0.185]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every wide car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every wide car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  88%|████████▊ | 440/500 [16:43<02:16,  2.28s/it, loss=0.217]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all yellow person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all yellow person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 1:  88%|████████▊ | 441/500 [16:46<02:14,  2.28s/it, loss=0.288]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  88%|████████▊ | 442/500 [16:48<02:12,  2.28s/it, loss=1.25] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bicycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bicycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  89%|████████▊ | 443/500 [16:50<02:10,  2.28s/it, loss=0.449]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all frisbee objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all frisbee objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  89%|████████▉ | 444/500 [16:53<02:07,  2.28s/it, loss=1.57] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bed from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bed from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  89%|████████▉ | 445/500 [16:55<02:05,  2.28s/it, loss=3.26]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  89%|████████▉ | 446/500 [16:57<02:03,  2.28s/it, loss=1.91]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  89%|████████▉ | 447/500 [16:59<02:00,  2.28s/it, loss=0.627]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment fire hydrant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment fire hydrant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  90%|████████▉ | 448/500 [17:02<01:58,  2.28s/it, loss=1.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  90%|████████▉ | 449/500 [17:04<01:56,  2.29s/it, loss=1.06]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all car objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all car objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  90%|█████████ | 450/500 [17:06<01:54,  2.29s/it, loss=0.434]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all small umbrella objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all small umbrella objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  90%|█████████ | 451/500 [17:09<01:52,  2.29s/it, loss=0.217]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment oval chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment oval chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  90%|█████████ | 452/500 [17:11<01:49,  2.29s/it, loss=0.689]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all white sink in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all white sink in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 1:  91%|█████████ | 453/500 [17:13<01:47,  2.29s/it, loss=0.286]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  91%|█████████ | 454/500 [17:16<01:45,  2.29s/it, loss=1.91] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment square bench']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment square bench']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1:  91%|█████████ | 455/500 [17:18<01:42,  2.29s/it, loss=2.41]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  91%|█████████ | 456/500 [17:20<01:40,  2.28s/it, loss=2.48]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all elephant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all elephant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  91%|█████████▏| 457/500 [17:22<01:37,  2.28s/it, loss=0.692]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all large mouse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all large mouse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  92%|█████████▏| 458/500 [17:25<01:35,  2.27s/it, loss=0.187]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  92%|█████████▏| 459/500 [17:27<01:33,  2.27s/it, loss=3.49] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  92%|█████████▏| 460/500 [17:29<01:30,  2.27s/it, loss=2.68]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all chair regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all chair regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  92%|█████████▏| 461/500 [17:31<01:28,  2.28s/it, loss=0.275]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  92%|█████████▏| 462/500 [17:34<01:26,  2.28s/it, loss=1.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  93%|█████████▎| 463/500 [17:36<01:24,  2.28s/it, loss=3.05]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment green airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment green airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  93%|█████████▎| 464/500 [17:38<01:21,  2.28s/it, loss=2.92]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all circular dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all circular dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  93%|█████████▎| 465/500 [17:41<01:19,  2.28s/it, loss=0.496]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bed']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bed']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 1:  93%|█████████▎| 466/500 [17:43<01:17,  2.28s/it, loss=6.82] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  93%|█████████▎| 467/500 [17:45<01:15,  2.28s/it, loss=3.93]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1:  94%|█████████▎| 468/500 [17:47<01:12,  2.28s/it, loss=1.65]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all square giraffe regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all square giraffe regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 1:  94%|█████████▍| 469/500 [17:50<01:10,  2.28s/it, loss=3.5] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 1:  94%|█████████▍| 470/500 [17:52<01:08,  2.28s/it, loss=1.21]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  94%|█████████▍| 471/500 [17:54<01:06,  2.29s/it, loss=0.351]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all toilet objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all toilet objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  94%|█████████▍| 472/500 [17:57<01:03,  2.29s/it, loss=1.2]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all dining table']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all dining table']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  95%|█████████▍| 473/500 [17:59<01:01,  2.28s/it, loss=0.164]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  95%|█████████▍| 474/500 [18:01<00:59,  2.28s/it, loss=0.63] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  95%|█████████▌| 475/500 [18:03<00:57,  2.28s/it, loss=2.21]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every purple toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every purple toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  95%|█████████▌| 476/500 [18:06<00:54,  2.28s/it, loss=1.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all rectangular traffic light from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all rectangular traffic light from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=10

Epoch 1:  95%|█████████▌| 477/500 [18:08<00:52,  2.28s/it, loss=0.802]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  96%|█████████▌| 478/500 [18:10<00:50,  2.28s/it, loss=2.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every laptop']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every laptop']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 1:  96%|█████████▌| 479/500 [18:12<00:47,  2.28s/it, loss=2.96]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 1:  96%|█████████▌| 480/500 [18:15<00:45,  2.28s/it, loss=1.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all dog in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all dog in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 1:  96%|█████████▌| 481/500 [18:17<00:43,  2.29s/it, loss=2.45]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment yellow giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment yellow giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  96%|█████████▋| 482/500 [18:19<00:41,  2.29s/it, loss=1.41]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every silver dining table']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every silver dining table']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  97%|█████████▋| 483/500 [18:22<00:38,  2.29s/it, loss=2.56]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 1:  97%|█████████▋| 484/500 [18:24<00:36,  2.28s/it, loss=0.622]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bus objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bus objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  97%|█████████▋| 485/500 [18:26<00:34,  2.29s/it, loss=0.609]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all horse regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all horse regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  97%|█████████▋| 486/500 [18:28<00:31,  2.28s/it, loss=3.75] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bottle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bottle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  97%|█████████▋| 487/500 [18:31<00:29,  2.28s/it, loss=0.583]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all giraffe objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all giraffe objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1:  98%|█████████▊| 488/500 [18:33<00:27,  2.28s/it, loss=1.08] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all round clock regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all round clock regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 1:  98%|█████████▊| 489/500 [18:35<00:25,  2.28s/it, loss=2.74]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all car objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all car objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 1:  98%|█████████▊| 490/500 [18:38<00:22,  2.28s/it, loss=1.71]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  98%|█████████▊| 491/500 [18:40<00:20,  2.28s/it, loss=2.11]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 1:  98%|█████████▊| 492/500 [18:42<00:18,  2.28s/it, loss=0.412]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all wide frisbee from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all wide frisbee from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 1:  99%|█████████▊| 493/500 [18:44<00:15,  2.28s/it, loss=0.0666]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 1:  99%|█████████▉| 494/500 [18:47<00:13,  2.28s/it, loss=0.906] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 1:  99%|█████████▉| 495/500 [18:49<00:11,  2.28s/it, loss=3.05] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all train in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all train in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 1:  99%|█████████▉| 496/500 [18:51<00:09,  2.28s/it, loss=2.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 1:  99%|█████████▉| 497/500 [18:54<00:06,  2.28s/it, loss=6.07]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 1: 100%|█████████▉| 498/500 [18:56<00:04,  2.28s/it, loss=1.76]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 1: 100%|█████████▉| 499/500 [18:58<00:02,  2.28s/it, loss=1.95]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 1: 100%|██████████| 500/500 [19:00<00:00,  2.28s/it, loss=0.252]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
Epoch 1/2, Average Loss: 1.7129





 ✔️Saved checkpoint to /mnt/data/checkpoints/checkpoint_epoch_0.pth


Epoch 2:   0%|          | 0/500 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sink regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sink regions']


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_B provided → (1, 4, 32)  (numel=128)
  • output_hypernetworks_mlps.1.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.1.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.1.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.1.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.1.layers.2.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.1.lay

Epoch 2:   0%|          | 1/500 [00:02<21:01,  2.53s/it, loss=0.396]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:   0%|          | 2/500 [00:04<19:37,  2.37s/it, loss=1.44] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all backpack objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all backpack objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:   1%|          | 3/500 [00:07<19:08,  2.31s/it, loss=2.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all square tie regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all square tie regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:   1%|          | 4/500 [00:09<18:58,  2.30s/it, loss=3.6] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:   1%|          | 5/500 [00:11<18:54,  2.29s/it, loss=2.3]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all teddy bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all teddy bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:   1%|          | 6/500 [00:13<18:53,  2.29s/it, loss=0.911]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all toilet regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all toilet regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:   1%|▏         | 7/500 [00:16<18:53,  2.30s/it, loss=1.79] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:   2%|▏         | 8/500 [00:18<18:53,  2.30s/it, loss=0.925]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bird in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bird in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:   2%|▏         | 9/500 [00:20<18:53,  2.31s/it, loss=1.58] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:   2%|▏         | 10/500 [00:23<18:52,  2.31s/it, loss=0.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tv in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tv in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:   2%|▏         | 11/500 [00:25<18:55,  2.32s/it, loss=2.02]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all potted plant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all potted plant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:   2%|▏         | 12/500 [00:27<18:53,  2.32s/it, loss=3.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all dog regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all dog regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:   3%|▎         | 13/500 [00:30<18:51,  2.32s/it, loss=2.06]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all suitcase regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all suitcase regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:   3%|▎         | 14/500 [00:32<18:48,  2.32s/it, loss=0.799]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all silver cow objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all silver cow objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:   3%|▎         | 15/500 [00:34<18:44,  2.32s/it, loss=0.914]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment oval toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment oval toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:   3%|▎         | 16/500 [00:37<18:39,  2.31s/it, loss=0.761]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all boat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all boat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:   3%|▎         | 17/500 [00:39<18:35,  2.31s/it, loss=2.02] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tie in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tie in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:   4%|▎         | 18/500 [00:41<18:30,  2.30s/it, loss=0.982]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tv objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tv objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:   4%|▍         | 19/500 [00:43<18:25,  2.30s/it, loss=0.776]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:   4%|▍         | 20/500 [00:46<18:20,  2.29s/it, loss=1.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all motorcycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all motorcycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:   4%|▍         | 21/500 [00:48<18:20,  2.30s/it, loss=0.233]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all red bicycle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all red bicycle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:   4%|▍         | 22/500 [00:50<18:16,  2.29s/it, loss=1.02] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:   5%|▍         | 23/500 [00:53<18:11,  2.29s/it, loss=1.51]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all square car objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all square car objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:   5%|▍         | 24/500 [00:55<18:08,  2.29s/it, loss=0.796]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all elephant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all elephant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:   5%|▌         | 25/500 [00:57<18:04,  2.28s/it, loss=1.2]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:   5%|▌         | 26/500 [00:59<17:58,  2.28s/it, loss=1.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all chair regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all chair regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:   5%|▌         | 27/500 [01:02<17:55,  2.27s/it, loss=1.53]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:   6%|▌         | 28/500 [01:04<17:51,  2.27s/it, loss=0.837]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all truck regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all truck regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:   6%|▌         | 29/500 [01:06<17:47,  2.27s/it, loss=0.426]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all potted plant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all potted plant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:   6%|▌         | 30/500 [01:08<17:43,  2.26s/it, loss=0.485]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all rectangular person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all rectangular person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 2:   6%|▌         | 31/500 [01:11<17:40,  2.26s/it, loss=0.506]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all circular traffic light regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all circular traffic light regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_h

Epoch 2:   6%|▋         | 32/500 [01:13<17:35,  2.25s/it, loss=3.25] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:   7%|▋         | 33/500 [01:15<17:31,  2.25s/it, loss=3.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all silver bird regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all silver bird regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:   7%|▋         | 34/500 [01:17<17:30,  2.25s/it, loss=3.6] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all giraffe in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all giraffe in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:   7%|▋         | 35/500 [01:20<17:26,  2.25s/it, loss=0.91]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment suitcase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment suitcase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:   7%|▋         | 36/500 [01:22<17:24,  2.25s/it, loss=2.21]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bird regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bird regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:   7%|▋         | 37/500 [01:24<17:22,  2.25s/it, loss=1.18]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:   8%|▊         | 38/500 [01:26<17:20,  2.25s/it, loss=0.695]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every microwave']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every microwave']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:   8%|▊         | 39/500 [01:29<17:20,  2.26s/it, loss=0.788]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:   8%|▊         | 40/500 [01:31<17:18,  2.26s/it, loss=2.75] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:   8%|▊         | 41/500 [01:33<17:20,  2.27s/it, loss=0.195]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all donut in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all donut in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:   8%|▊         | 42/500 [01:36<17:17,  2.27s/it, loss=1.48] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment square vase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment square vase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:   9%|▊         | 43/500 [01:38<17:14,  2.26s/it, loss=0.918]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:   9%|▉         | 44/500 [01:40<17:12,  2.27s/it, loss=3.76] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all oven objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all oven objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:   9%|▉         | 45/500 [01:42<17:09,  2.26s/it, loss=0.648]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all zebra in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all zebra in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:   9%|▉         | 46/500 [01:45<17:03,  2.26s/it, loss=0.184]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment silver person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment silver person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:   9%|▉         | 47/500 [01:47<17:01,  2.26s/it, loss=0.682]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all triangular cow objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all triangular cow objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  10%|▉         | 48/500 [01:49<16:58,  2.25s/it, loss=2.04] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  10%|▉         | 49/500 [01:51<16:55,  2.25s/it, loss=0.824]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  10%|█         | 50/500 [01:54<16:53,  2.25s/it, loss=1.06] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  10%|█         | 51/500 [01:56<16:53,  2.26s/it, loss=5.62]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  10%|█         | 52/500 [01:58<16:52,  2.26s/it, loss=0.331]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  11%|█         | 53/500 [02:00<16:49,  2.26s/it, loss=3.46] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all toothbrush in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all toothbrush in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  11%|█         | 54/500 [02:03<16:48,  2.26s/it, loss=0.502]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment triangular person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment triangular person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  11%|█         | 55/500 [02:05<16:47,  2.26s/it, loss=0.344]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all green bottle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all green bottle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  11%|█         | 56/500 [02:07<16:45,  2.26s/it, loss=0.23] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all black elephant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all black elephant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  11%|█▏        | 57/500 [02:09<16:43,  2.27s/it, loss=0.25]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 2:  12%|█▏        | 58/500 [02:12<16:40,  2.26s/it, loss=1.32]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  12%|█▏        | 59/500 [02:14<16:38,  2.26s/it, loss=6.54]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bicycle regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bicycle regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  12%|█▏        | 60/500 [02:16<16:36,  2.26s/it, loss=1.6] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  12%|█▏        | 61/500 [02:19<16:37,  2.27s/it, loss=3.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all couch in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all couch in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  12%|█▏        | 62/500 [02:21<16:34,  2.27s/it, loss=3.07]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment square bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment square bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  13%|█▎        | 63/500 [02:23<16:30,  2.27s/it, loss=2.84]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  13%|█▎        | 64/500 [02:25<16:27,  2.26s/it, loss=0.278]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment triangular giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment triangular giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  13%|█▎        | 65/500 [02:28<16:25,  2.27s/it, loss=0.73] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 2:  13%|█▎        | 66/500 [02:30<16:23,  2.27s/it, loss=0.462]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all elephant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all elephant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  13%|█▎        | 67/500 [02:32<16:21,  2.27s/it, loss=1.84] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all fire hydrant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all fire hydrant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  14%|█▎        | 68/500 [02:34<16:19,  2.27s/it, loss=2.43]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all motorcycle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all motorcycle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  14%|█▍        | 69/500 [02:37<16:17,  2.27s/it, loss=1.96]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  14%|█▍        | 70/500 [02:39<16:14,  2.27s/it, loss=3.89]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all long person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all long person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  14%|█▍        | 71/500 [02:41<16:15,  2.27s/it, loss=0.769]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  14%|█▍        | 72/500 [02:43<16:14,  2.28s/it, loss=0.48] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  15%|█▍        | 73/500 [02:46<16:10,  2.27s/it, loss=4.72]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tall skateboard objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tall skateboard objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  15%|█▍        | 74/500 [02:48<16:06,  2.27s/it, loss=0.127]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment orange boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment orange boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  15%|█▌        | 75/500 [02:50<16:04,  2.27s/it, loss=5.34] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  15%|█▌        | 76/500 [02:53<16:01,  2.27s/it, loss=3.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  15%|█▌        | 77/500 [02:55<15:59,  2.27s/it, loss=1.84]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  16%|█▌        | 78/500 [02:57<15:58,  2.27s/it, loss=2.19]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  16%|█▌        | 79/500 [02:59<15:54,  2.27s/it, loss=2.76]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all traffic light from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all traffic light from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 2:  16%|█▌        | 80/500 [03:02<15:51,  2.27s/it, loss=0.467]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all dog in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all dog in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  16%|█▌        | 81/500 [03:04<15:52,  2.27s/it, loss=1.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all mouse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all mouse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  16%|█▋        | 82/500 [03:06<15:52,  2.28s/it, loss=0.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all small car regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all small car regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  17%|█▋        | 83/500 [03:08<15:49,  2.28s/it, loss=0.666]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all brown cat objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all brown cat objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  17%|█▋        | 84/500 [03:11<15:47,  2.28s/it, loss=1.86] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  17%|█▋        | 85/500 [03:13<15:45,  2.28s/it, loss=0.766]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment dining table']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment dining table']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  17%|█▋        | 86/500 [03:15<15:43,  2.28s/it, loss=7.79] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  17%|█▋        | 87/500 [03:18<15:38,  2.27s/it, loss=2.92]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every oval giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every oval giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  18%|█▊        | 88/500 [03:20<15:36,  2.27s/it, loss=2.32]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  18%|█▊        | 89/500 [03:22<15:34,  2.27s/it, loss=0.438]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  18%|█▊        | 90/500 [03:24<15:32,  2.27s/it, loss=1.39] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  18%|█▊        | 91/500 [03:27<15:33,  2.28s/it, loss=0.394]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every triangular giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every triangular giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  18%|█▊        | 92/500 [03:29<15:30,  2.28s/it, loss=2.06] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  19%|█▊        | 93/500 [03:31<15:27,  2.28s/it, loss=1.95]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all gray clock objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all gray clock objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  19%|█▉        | 94/500 [03:34<15:24,  2.28s/it, loss=1.42]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment vase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment vase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  19%|█▉        | 95/500 [03:36<15:23,  2.28s/it, loss=0.589]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all remote regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all remote regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  19%|█▉        | 96/500 [03:38<15:21,  2.28s/it, loss=0.756]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  19%|█▉        | 97/500 [03:40<15:18,  2.28s/it, loss=3.05] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all black person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all black person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  20%|█▉        | 98/500 [03:43<15:15,  2.28s/it, loss=1.28]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  20%|█▉        | 99/500 [03:45<15:13,  2.28s/it, loss=0.768]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  20%|██        | 100/500 [03:47<15:11,  2.28s/it, loss=3.52]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all elephant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all elephant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  20%|██        | 101/500 [03:49<15:10,  2.28s/it, loss=0.468]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  20%|██        | 102/500 [03:52<15:07,  2.28s/it, loss=1.79] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every handbag']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every handbag']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  21%|██        | 103/500 [03:54<15:05,  2.28s/it, loss=0.561]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  21%|██        | 104/500 [03:56<15:02,  2.28s/it, loss=3.84] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  21%|██        | 105/500 [03:59<14:58,  2.28s/it, loss=4.98]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  21%|██        | 106/500 [04:01<14:57,  2.28s/it, loss=0.396]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cup in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cup in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  21%|██▏       | 107/500 [04:03<14:54,  2.28s/it, loss=0.446]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment red giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment red giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  22%|██▏       | 108/500 [04:05<14:52,  2.28s/it, loss=0.978]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  22%|██▏       | 109/500 [04:08<14:49,  2.28s/it, loss=1.83] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bench from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bench from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  22%|██▏       | 110/500 [04:10<14:47,  2.28s/it, loss=0.465]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment orange zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment orange zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  22%|██▏       | 111/500 [04:12<14:48,  2.28s/it, loss=3.5]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  22%|██▏       | 112/500 [04:15<14:45,  2.28s/it, loss=0.276]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  23%|██▎       | 113/500 [04:17<14:41,  2.28s/it, loss=4.99] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bottle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bottle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  23%|██▎       | 114/500 [04:19<14:38,  2.28s/it, loss=2.6] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all black microwave in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all black microwave in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  23%|██▎       | 115/500 [04:21<14:36,  2.28s/it, loss=0.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  23%|██▎       | 116/500 [04:24<14:34,  2.28s/it, loss=0.608]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all car regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all car regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  23%|██▎       | 117/500 [04:26<14:31,  2.27s/it, loss=2.97] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  24%|██▎       | 118/500 [04:28<14:28,  2.27s/it, loss=0.702]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all handbag regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all handbag regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  24%|██▍       | 119/500 [04:30<14:27,  2.28s/it, loss=0.262]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all small frisbee in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all small frisbee in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  24%|██▍       | 120/500 [04:33<14:24,  2.28s/it, loss=1.01] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all elephant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all elephant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  24%|██▍       | 121/500 [04:35<14:26,  2.29s/it, loss=5.68]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  24%|██▍       | 122/500 [04:37<14:24,  2.29s/it, loss=1.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment oval bicycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment oval bicycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  25%|██▍       | 123/500 [04:40<14:21,  2.28s/it, loss=0.456]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every brown person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every brown person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  25%|██▍       | 124/500 [04:42<14:18,  2.28s/it, loss=0.525]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  25%|██▌       | 125/500 [04:44<14:15,  2.28s/it, loss=3.08] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all elephant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all elephant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  25%|██▌       | 126/500 [04:46<14:12,  2.28s/it, loss=1.07]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all book objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all book objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  25%|██▌       | 127/500 [04:49<14:09,  2.28s/it, loss=0.531]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every tall horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every tall horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  26%|██▌       | 128/500 [04:51<14:07,  2.28s/it, loss=2.81] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all wide horse regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all wide horse regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  26%|██▌       | 129/500 [04:53<14:04,  2.28s/it, loss=0.519]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  26%|██▌       | 130/500 [04:56<14:01,  2.27s/it, loss=3.2]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment red potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment red potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  26%|██▌       | 131/500 [04:58<14:00,  2.28s/it, loss=3.67]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all brown umbrella regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all brown umbrella regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  26%|██▋       | 132/500 [05:00<13:58,  2.28s/it, loss=0.935]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 2:  27%|██▋       | 133/500 [05:02<13:57,  2.28s/it, loss=1.69] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  27%|██▋       | 134/500 [05:05<13:55,  2.28s/it, loss=2.03]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  27%|██▋       | 135/500 [05:07<13:50,  2.28s/it, loss=2.8] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every red potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every red potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  27%|██▋       | 136/500 [05:09<13:49,  2.28s/it, loss=0.492]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all wide sink from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all wide sink from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  27%|██▋       | 137/500 [05:11<13:46,  2.28s/it, loss=0.255]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  28%|██▊       | 138/500 [05:14<13:43,  2.28s/it, loss=0.134]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment handbag']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment handbag']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  28%|██▊       | 139/500 [05:16<13:41,  2.28s/it, loss=0.28] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all airplane regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all airplane regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  28%|██▊       | 140/500 [05:18<13:39,  2.28s/it, loss=3.42]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  28%|██▊       | 141/500 [05:21<13:39,  2.28s/it, loss=1.44]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)  

Epoch 2:  28%|██▊       | 142/500 [05:23<13:37,  2.28s/it, loss=2]   

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bird regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bird regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  29%|██▊       | 143/500 [05:25<13:33,  2.28s/it, loss=1.52]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bicycle objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bicycle objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  29%|██▉       | 144/500 [05:27<13:31,  2.28s/it, loss=0.573]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all orange motorcycle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all orange motorcycle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  29%|██▉       | 145/500 [05:30<13:26,  2.27s/it, loss=0.238]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all wide zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all wide zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  29%|██▉       | 146/500 [05:32<13:23,  2.27s/it, loss=1.96] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bicycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bicycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  29%|██▉       | 147/500 [05:34<13:21,  2.27s/it, loss=1.32]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all fire hydrant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all fire hydrant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  30%|██▉       | 148/500 [05:37<13:19,  2.27s/it, loss=1.18]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bench objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bench objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  30%|██▉       | 149/500 [05:39<13:17,  2.27s/it, loss=0.608]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  30%|███       | 150/500 [05:41<13:15,  2.27s/it, loss=1.07] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all cup regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all cup regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  30%|███       | 151/500 [05:43<13:14,  2.28s/it, loss=1.86]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment frisbee']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment frisbee']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  30%|███       | 152/500 [05:46<13:12,  2.28s/it, loss=0.851]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment long tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment long tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  31%|███       | 153/500 [05:48<13:10,  2.28s/it, loss=2.93] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  31%|███       | 154/500 [05:50<13:07,  2.28s/it, loss=2.53]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all triangular giraffe from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all triangular giraffe from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output

Epoch 2:  31%|███       | 155/500 [05:52<13:05,  2.28s/it, loss=2.16]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all boat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all boat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  31%|███       | 156/500 [05:55<13:03,  2.28s/it, loss=2.37]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all car regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all car regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  31%|███▏      | 157/500 [05:57<13:00,  2.28s/it, loss=1.58]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  32%|███▏      | 158/500 [05:59<12:57,  2.27s/it, loss=1.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all sink objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all sink objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  32%|███▏      | 159/500 [06:02<12:54,  2.27s/it, loss=0.493]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all potted plant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all potted plant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  32%|███▏      | 160/500 [06:04<12:52,  2.27s/it, loss=0.525]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  32%|███▏      | 161/500 [06:06<12:53,  2.28s/it, loss=0.849]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  32%|███▏      | 162/500 [06:08<12:51,  2.28s/it, loss=1.23] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  33%|███▎      | 163/500 [06:11<12:47,  2.28s/it, loss=1.8] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment white remote']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment white remote']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  33%|███▎      | 164/500 [06:13<12:45,  2.28s/it, loss=1.25]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all sink objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all sink objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  33%|███▎      | 165/500 [06:15<12:41,  2.27s/it, loss=2.16]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  33%|███▎      | 166/500 [06:18<12:40,  2.28s/it, loss=1.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  33%|███▎      | 167/500 [06:20<12:38,  2.28s/it, loss=0.534]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment round elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment round elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  34%|███▎      | 168/500 [06:22<12:35,  2.28s/it, loss=1.25] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all fire hydrant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all fire hydrant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  34%|███▍      | 169/500 [06:24<12:31,  2.27s/it, loss=0.412]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  34%|███▍      | 170/500 [06:27<12:29,  2.27s/it, loss=0.381]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bowl regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bowl regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  34%|███▍      | 171/500 [06:29<12:29,  2.28s/it, loss=0.844]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all dining table objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all dining table objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  34%|███▍      | 172/500 [06:31<12:27,  2.28s/it, loss=0.859]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment circular keyboard']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment circular keyboard']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 2:  35%|███▍      | 173/500 [06:33<12:24,  2.28s/it, loss=2.77] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bottle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bottle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  35%|███▍      | 174/500 [06:36<12:22,  2.28s/it, loss=3.67]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all toilet regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all toilet regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  35%|███▌      | 175/500 [06:38<12:19,  2.28s/it, loss=0.713]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all giraffe in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all giraffe in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  35%|███▌      | 176/500 [06:40<12:18,  2.28s/it, loss=1.35] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all backpack in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all backpack in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  35%|███▌      | 177/500 [06:43<12:16,  2.28s/it, loss=0.629]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment red person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment red person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  36%|███▌      | 178/500 [06:45<12:13,  2.28s/it, loss=3.04] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment motorcycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment motorcycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  36%|███▌      | 179/500 [06:47<12:11,  2.28s/it, loss=0.404]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every clock']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every clock']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  36%|███▌      | 180/500 [06:49<12:08,  2.28s/it, loss=0.515]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every pink person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every pink person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  36%|███▌      | 181/500 [06:52<12:08,  2.28s/it, loss=2.29] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  36%|███▋      | 182/500 [06:54<12:06,  2.28s/it, loss=2.72]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all silver motorcycle from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all silver motorcycle from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_h

Epoch 2:  37%|███▋      | 183/500 [06:56<12:03,  2.28s/it, loss=0.676]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  37%|███▋      | 184/500 [06:59<11:59,  2.28s/it, loss=2.12] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  37%|███▋      | 185/500 [07:01<11:57,  2.28s/it, loss=0.196]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  37%|███▋      | 186/500 [07:03<11:55,  2.28s/it, loss=1.88] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tall bird in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tall bird in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  37%|███▋      | 187/500 [07:05<11:53,  2.28s/it, loss=0.646]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bicycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bicycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  38%|███▊      | 188/500 [07:08<11:51,  2.28s/it, loss=0.25] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every orange vase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every orange vase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  38%|███▊      | 189/500 [07:10<11:49,  2.28s/it, loss=0.284]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cup from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cup from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  38%|███▊      | 190/500 [07:12<11:47,  2.28s/it, loss=0.685]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bench objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bench objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  38%|███▊      | 191/500 [07:14<11:46,  2.29s/it, loss=0.995]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all rectangular person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all rectangular person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetw

Epoch 2:  38%|███▊      | 192/500 [07:17<11:45,  2.29s/it, loss=0.848]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all laptop in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all laptop in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  39%|███▊      | 193/500 [07:19<11:42,  2.29s/it, loss=2.35] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  39%|███▉      | 194/500 [07:21<11:39,  2.29s/it, loss=6.81]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  39%|███▉      | 195/500 [07:24<11:35,  2.28s/it, loss=3.14]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tall stop sign objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tall stop sign objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  39%|███▉      | 196/500 [07:26<11:33,  2.28s/it, loss=3.57]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all blue sheep from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all blue sheep from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  39%|███▉      | 197/500 [07:28<11:31,  2.28s/it, loss=3.29]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all elephant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all elephant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  40%|███▉      | 198/500 [07:30<11:28,  2.28s/it, loss=4.96]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 2:  40%|███▉      | 199/500 [07:33<11:26,  2.28s/it, loss=2.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  40%|████      | 200/500 [07:35<11:24,  2.28s/it, loss=0.195]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  40%|████      | 201/500 [07:37<11:24,  2.29s/it, loss=0.5]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment black person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment black person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  40%|████      | 202/500 [07:40<11:22,  2.29s/it, loss=1.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment rectangular person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment rectangular person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  41%|████      | 203/500 [07:42<11:19,  2.29s/it, loss=1.65]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment thin zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment thin zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  41%|████      | 204/500 [07:44<11:16,  2.29s/it, loss=3.18]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment purple horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment purple horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  41%|████      | 205/500 [07:46<11:12,  2.28s/it, loss=1.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all chair in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all chair in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  41%|████      | 206/500 [07:49<11:11,  2.28s/it, loss=1.18]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all round boat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all round boat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  41%|████▏     | 207/500 [07:51<11:07,  2.28s/it, loss=2.49]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  42%|████▏     | 208/500 [07:53<11:04,  2.28s/it, loss=2.86]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bench']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bench']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  42%|████▏     | 209/500 [07:56<11:03,  2.28s/it, loss=0.309]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all black tv objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all black tv objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  42%|████▏     | 210/500 [07:58<11:00,  2.28s/it, loss=0.507]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tv objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tv objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  42%|████▏     | 211/500 [08:00<10:59,  2.28s/it, loss=0.478]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all blue tie']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all blue tie']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  42%|████▏     | 212/500 [08:02<10:58,  2.28s/it, loss=0.61] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment green person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment green person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  43%|████▎     | 213/500 [08:05<10:54,  2.28s/it, loss=1.39]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  43%|████▎     | 214/500 [08:07<10:51,  2.28s/it, loss=4.59]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  43%|████▎     | 215/500 [08:09<10:48,  2.28s/it, loss=4.76]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all car regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all car regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  43%|████▎     | 216/500 [08:12<10:45,  2.27s/it, loss=2.93]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  43%|████▎     | 217/500 [08:14<10:43,  2.27s/it, loss=1.37]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all book in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all book in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  44%|████▎     | 218/500 [08:16<10:41,  2.27s/it, loss=0.725]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  44%|████▍     | 219/500 [08:18<10:38,  2.27s/it, loss=1.91] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bench in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bench in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  44%|████▍     | 220/500 [08:21<10:36,  2.27s/it, loss=0.861]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  44%|████▍     | 221/500 [08:23<10:35,  2.28s/it, loss=1.96] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all suitcase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all suitcase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  44%|████▍     | 222/500 [08:25<10:33,  2.28s/it, loss=1.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  45%|████▍     | 223/500 [08:27<10:29,  2.27s/it, loss=0.649]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all dog regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all dog regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  45%|████▍     | 224/500 [08:30<10:26,  2.27s/it, loss=1.84] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  45%|████▌     | 225/500 [08:32<10:24,  2.27s/it, loss=2.28]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment red broccoli']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment red broccoli']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  45%|████▌     | 226/500 [08:34<10:22,  2.27s/it, loss=1.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all orange umbrella objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all orange umbrella objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  45%|████▌     | 227/500 [08:37<10:19,  2.27s/it, loss=1.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all sink in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all sink in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  46%|████▌     | 228/500 [08:39<10:17,  2.27s/it, loss=0.502]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all round bird in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all round bird in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  46%|████▌     | 229/500 [08:41<10:15,  2.27s/it, loss=3.24] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all elephant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all elephant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  46%|████▌     | 230/500 [08:43<10:12,  2.27s/it, loss=2.2] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment handbag']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment handbag']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  46%|████▌     | 231/500 [08:46<10:12,  2.28s/it, loss=0.652]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tie in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tie in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  46%|████▋     | 232/500 [08:48<10:09,  2.27s/it, loss=2.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  47%|████▋     | 233/500 [08:50<10:06,  2.27s/it, loss=3.58]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all bottle objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all bottle objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  47%|████▋     | 234/500 [08:52<10:04,  2.27s/it, loss=0.823]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all orange boat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all orange boat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 2:  47%|████▋     | 235/500 [08:55<10:02,  2.28s/it, loss=1.71] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all circular person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all circular person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  47%|████▋     | 236/500 [08:57<09:59,  2.27s/it, loss=2.15]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all elephant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all elephant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  47%|████▋     | 237/500 [08:59<09:56,  2.27s/it, loss=3.57]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all dining table regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all dining table regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  48%|████▊     | 238/500 [09:02<09:55,  2.27s/it, loss=0.116]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every brown dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every brown dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  48%|████▊     | 239/500 [09:04<09:52,  2.27s/it, loss=0.913]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every tall elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every tall elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  48%|████▊     | 240/500 [09:06<09:50,  2.27s/it, loss=6.36] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  48%|████▊     | 241/500 [09:08<09:50,  2.28s/it, loss=0.246]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  48%|████▊     | 242/500 [09:11<09:48,  2.28s/it, loss=2.08] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all large horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all large horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 2:  49%|████▊     | 243/500 [09:13<09:45,  2.28s/it, loss=1.29]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  49%|████▉     | 244/500 [09:15<09:42,  2.28s/it, loss=1.43]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  49%|████▉     | 245/500 [09:17<09:40,  2.28s/it, loss=0.873]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bench from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bench from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  49%|████▉     | 246/500 [09:20<09:37,  2.27s/it, loss=2.04] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment yellow cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment yellow cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  49%|████▉     | 247/500 [09:22<09:35,  2.27s/it, loss=3.43]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment umbrella']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment umbrella']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  50%|████▉     | 248/500 [09:24<09:32,  2.27s/it, loss=0.426]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all round bus regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all round bus regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  50%|████▉     | 249/500 [09:27<09:29,  2.27s/it, loss=4.62] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all sink objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all sink objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  50%|█████     | 250/500 [09:29<09:28,  2.27s/it, loss=1.9] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment square suitcase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment square suitcase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  50%|█████     | 251/500 [09:31<09:27,  2.28s/it, loss=0.803]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  50%|█████     | 252/500 [09:33<09:24,  2.28s/it, loss=2.07] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  51%|█████     | 253/500 [09:36<09:22,  2.28s/it, loss=0.807]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all silver boat objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all silver boat objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  51%|█████     | 254/500 [09:38<09:19,  2.28s/it, loss=4.52] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all blue dog in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all blue dog in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  51%|█████     | 255/500 [09:40<09:16,  2.27s/it, loss=3.02]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all rectangular boat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all rectangular boat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 2:  51%|█████     | 256/500 [09:42<09:13,  2.27s/it, loss=1.42]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  51%|█████▏    | 257/500 [09:45<09:11,  2.27s/it, loss=0.636]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all laptop regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all laptop regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  52%|█████▏    | 258/500 [09:47<09:08,  2.27s/it, loss=2.87] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  52%|█████▏    | 259/500 [09:49<09:06,  2.27s/it, loss=4.26]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  52%|█████▏    | 260/500 [09:52<09:04,  2.27s/it, loss=0.147]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  52%|█████▏    | 261/500 [09:54<09:04,  2.28s/it, loss=0.0714]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment circular elephant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment circular elephant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 2:  52%|█████▏    | 262/500 [09:56<09:01,  2.27s/it, loss=0.315] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all large bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all large bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  53%|█████▎    | 263/500 [09:58<08:58,  2.27s/it, loss=1.33] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all orange spoon']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all orange spoon']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  53%|█████▎    | 264/500 [10:01<08:56,  2.27s/it, loss=0.733]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all wide parking meter objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all wide parking meter objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  53%|█████▎    | 265/500 [10:03<08:53,  2.27s/it, loss=2.37] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  53%|█████▎    | 266/500 [10:05<08:50,  2.27s/it, loss=2.15]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all traffic light in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all traffic light in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  53%|█████▎    | 267/500 [10:07<08:48,  2.27s/it, loss=0.45]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  54%|█████▎    | 268/500 [10:10<08:46,  2.27s/it, loss=0.477]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all black cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all black cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  54%|█████▍    | 269/500 [10:12<08:43,  2.27s/it, loss=1.95] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all square handbag in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all square handbag in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  54%|█████▍    | 270/500 [10:14<08:41,  2.27s/it, loss=0.35]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment frisbee']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment frisbee']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  54%|█████▍    | 271/500 [10:17<08:41,  2.28s/it, loss=0.719]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all boat objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all boat objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  54%|█████▍    | 272/500 [10:19<08:38,  2.28s/it, loss=2]    

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  55%|█████▍    | 273/500 [10:21<08:35,  2.27s/it, loss=1.38]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bus']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bus']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  55%|█████▍    | 274/500 [10:23<08:33,  2.27s/it, loss=1.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  55%|█████▌    | 275/500 [10:26<08:29,  2.27s/it, loss=3.69]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment tie']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment tie']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  55%|█████▌    | 276/500 [10:28<08:27,  2.27s/it, loss=2.38]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all train in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all train in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  55%|█████▌    | 277/500 [10:30<08:25,  2.27s/it, loss=3.37]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  56%|█████▌    | 278/500 [10:32<08:24,  2.27s/it, loss=2.3] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  56%|█████▌    | 279/500 [10:35<08:22,  2.27s/it, loss=0.735]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment orange']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment orange']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  56%|█████▌    | 280/500 [10:37<08:19,  2.27s/it, loss=0.895]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  56%|█████▌    | 281/500 [10:39<08:18,  2.28s/it, loss=3.29] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  56%|█████▋    | 282/500 [10:41<08:16,  2.28s/it, loss=1.32]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all cat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all cat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  57%|█████▋    | 283/500 [10:44<08:13,  2.27s/it, loss=0.298]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all dog objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all dog objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  57%|█████▋    | 284/500 [10:46<08:10,  2.27s/it, loss=2.91] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  57%|█████▋    | 285/500 [10:48<08:07,  2.27s/it, loss=1.29]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bottle regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bottle regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  57%|█████▋    | 286/500 [10:51<08:05,  2.27s/it, loss=0.058]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all sheep in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all sheep in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  57%|█████▋    | 287/500 [10:53<08:02,  2.27s/it, loss=1.32] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment cell phone']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment cell phone']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  58%|█████▊    | 288/500 [10:55<08:00,  2.27s/it, loss=0.189]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all motorcycle regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all motorcycle regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  58%|█████▊    | 289/500 [10:57<07:58,  2.27s/it, loss=3.78] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all bird from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all bird from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  58%|█████▊    | 290/500 [11:00<07:55,  2.26s/it, loss=0.879]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cat in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cat in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  58%|█████▊    | 291/500 [11:02<07:54,  2.27s/it, loss=2.26] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bicycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bicycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  58%|█████▊    | 292/500 [11:04<07:52,  2.27s/it, loss=0.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tv objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tv objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  59%|█████▊    | 293/500 [11:06<07:50,  2.27s/it, loss=1.9] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all circular dog regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all circular dog regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  59%|█████▉    | 294/500 [11:09<07:47,  2.27s/it, loss=6.4]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all blue person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all blue person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  59%|█████▉    | 295/500 [11:11<07:44,  2.27s/it, loss=1]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  59%|█████▉    | 296/500 [11:13<07:42,  2.27s/it, loss=0.446]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all round car objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all round car objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  59%|█████▉    | 297/500 [11:15<07:40,  2.27s/it, loss=0.433]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every blue banana']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every blue banana']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  60%|█████▉    | 298/500 [11:18<07:38,  2.27s/it, loss=2.79] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  60%|█████▉    | 299/500 [11:20<07:36,  2.27s/it, loss=2.63]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all suitcase regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all suitcase regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  60%|██████    | 300/500 [11:22<07:34,  2.27s/it, loss=3.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  60%|██████    | 301/500 [11:25<07:33,  2.28s/it, loss=0.223]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all zebra in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all zebra in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  60%|██████    | 302/500 [11:27<07:30,  2.28s/it, loss=0.579]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  61%|██████    | 303/500 [11:29<07:28,  2.27s/it, loss=3.05] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  61%|██████    | 304/500 [11:31<07:25,  2.27s/it, loss=0.572]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all horse regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all horse regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  61%|██████    | 305/500 [11:34<07:22,  2.27s/it, loss=3.85] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment circular person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment circular person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  61%|██████    | 306/500 [11:36<07:20,  2.27s/it, loss=0.897]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)  

Epoch 2:  61%|██████▏   | 307/500 [11:38<07:17,  2.27s/it, loss=1.03] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  62%|██████▏   | 308/500 [11:40<07:15,  2.27s/it, loss=0.454]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all elephant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all elephant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  62%|██████▏   | 309/500 [11:43<07:13,  2.27s/it, loss=0.639]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all wide toilet in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all wide toilet in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  62%|██████▏   | 310/500 [11:45<07:11,  2.27s/it, loss=1.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  62%|██████▏   | 311/500 [11:47<07:11,  2.28s/it, loss=0.956]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all square person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all square person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  62%|██████▏   | 312/500 [11:50<07:09,  2.28s/it, loss=1.86] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all round person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all round person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  63%|██████▎   | 313/500 [11:52<07:06,  2.28s/it, loss=1.58]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment frisbee']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment frisbee']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  63%|██████▎   | 314/500 [11:54<07:03,  2.28s/it, loss=0.0987]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all oval chair']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all oval chair']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  63%|██████▎   | 315/500 [11:56<07:00,  2.28s/it, loss=1.03]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment wide person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment wide person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  63%|██████▎   | 316/500 [11:59<06:58,  2.27s/it, loss=1.48]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  63%|██████▎   | 317/500 [12:01<06:56,  2.27s/it, loss=0.496]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment bottle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment bottle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  64%|██████▎   | 318/500 [12:03<06:53,  2.27s/it, loss=0.0477]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment boat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment boat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  64%|██████▍   | 319/500 [12:06<06:51,  2.27s/it, loss=0.602] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all boat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all boat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  64%|██████▍   | 320/500 [12:08<06:49,  2.27s/it, loss=3.06] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all cow in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all cow in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  64%|██████▍   | 321/500 [12:10<06:48,  2.28s/it, loss=2.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  64%|██████▍   | 322/500 [12:12<06:47,  2.29s/it, loss=0.818]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all orange giraffe regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all orange giraffe regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  65%|██████▍   | 323/500 [12:15<06:44,  2.29s/it, loss=2.34] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all pink bowl']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all pink bowl']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  65%|██████▍   | 324/500 [12:17<06:42,  2.28s/it, loss=0.0999]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  65%|██████▌   | 325/500 [12:19<06:39,  2.28s/it, loss=0.194] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all red elephant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all red elephant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  65%|██████▌   | 326/500 [12:22<06:37,  2.28s/it, loss=0.254]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  65%|██████▌   | 327/500 [12:24<06:34,  2.28s/it, loss=1.32] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all small horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all small horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  66%|██████▌   | 328/500 [12:26<06:32,  2.28s/it, loss=1.23]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  66%|██████▌   | 329/500 [12:28<06:29,  2.28s/it, loss=0.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  66%|██████▌   | 330/500 [12:31<06:27,  2.28s/it, loss=3.04]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  66%|██████▌   | 331/500 [12:33<06:27,  2.29s/it, loss=3.16]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  66%|██████▋   | 332/500 [12:35<06:24,  2.29s/it, loss=2.88]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all truck regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all truck regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  67%|██████▋   | 333/500 [12:38<06:22,  2.29s/it, loss=0.859]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment thin giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment thin giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  67%|██████▋   | 334/500 [12:40<06:19,  2.29s/it, loss=3]    

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all yellow cat regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all yellow cat regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  67%|██████▋   | 335/500 [12:42<06:17,  2.29s/it, loss=0.685]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every brown person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every brown person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  67%|██████▋   | 336/500 [12:44<06:15,  2.29s/it, loss=0.389]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all green bicycle']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all green bicycle']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  67%|██████▋   | 337/500 [12:47<06:12,  2.29s/it, loss=1.45] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all stop sign in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all stop sign in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  68%|██████▊   | 338/500 [12:49<06:10,  2.29s/it, loss=0.389]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  68%|██████▊   | 339/500 [12:51<06:08,  2.29s/it, loss=0.874]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  68%|██████▊   | 340/500 [12:54<06:05,  2.29s/it, loss=0.76] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  68%|██████▊   | 341/500 [12:56<06:04,  2.29s/it, loss=1.82]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  68%|██████▊   | 342/500 [12:58<06:02,  2.29s/it, loss=0.298]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all fire hydrant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all fire hydrant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  69%|██████▊   | 343/500 [13:00<05:59,  2.29s/it, loss=1.55] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  69%|██████▉   | 344/500 [13:03<05:56,  2.29s/it, loss=0.308]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all car objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all car objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  69%|██████▉   | 345/500 [13:05<05:54,  2.29s/it, loss=1.61] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment white person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment white person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  69%|██████▉   | 346/500 [13:07<05:51,  2.28s/it, loss=3.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all traffic light in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all traffic light in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  69%|██████▉   | 347/500 [13:10<05:49,  2.28s/it, loss=2.17]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  70%|██████▉   | 348/500 [13:12<05:46,  2.28s/it, loss=1.04]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all horse regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all horse regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  70%|██████▉   | 349/500 [13:14<05:44,  2.28s/it, loss=3.52]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment banana']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment banana']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  70%|███████   | 350/500 [13:16<05:42,  2.28s/it, loss=1.64]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  70%|███████   | 351/500 [13:19<05:40,  2.29s/it, loss=0.623]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  70%|███████   | 352/500 [13:21<05:38,  2.28s/it, loss=1.62] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  71%|███████   | 353/500 [13:23<05:35,  2.28s/it, loss=3.69]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment oval truck']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment oval truck']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  71%|███████   | 354/500 [13:26<05:33,  2.28s/it, loss=3.75]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  71%|███████   | 355/500 [13:28<05:30,  2.28s/it, loss=0.287]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  71%|███████   | 356/500 [13:30<05:28,  2.28s/it, loss=0.406]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sheep regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sheep regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  71%|███████▏  | 357/500 [13:32<05:25,  2.28s/it, loss=1.51] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  72%|███████▏  | 358/500 [13:35<05:23,  2.28s/it, loss=4.9] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  72%|███████▏  | 359/500 [13:37<05:21,  2.28s/it, loss=1.1]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all square car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all square car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  72%|███████▏  | 360/500 [13:39<05:19,  2.28s/it, loss=2.52]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all elephant objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all elephant objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  72%|███████▏  | 361/500 [13:42<05:17,  2.29s/it, loss=0.667]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every red sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every red sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provid

Epoch 2:  72%|███████▏  | 362/500 [13:44<05:15,  2.28s/it, loss=1.42] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all umbrella in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all umbrella in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  73%|███████▎  | 363/500 [13:46<05:12,  2.28s/it, loss=1.77]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  73%|███████▎  | 364/500 [13:48<05:09,  2.28s/it, loss=1.92]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all tv']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all tv']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)  

Epoch 2:  73%|███████▎  | 365/500 [13:51<05:07,  2.28s/it, loss=1.17]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all elephant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all elephant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  73%|███████▎  | 366/500 [13:53<05:05,  2.28s/it, loss=2.56]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  73%|███████▎  | 367/500 [13:55<05:02,  2.27s/it, loss=1.27]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all train objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all train objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  74%|███████▎  | 368/500 [13:57<05:00,  2.27s/it, loss=3.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment tall dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment tall dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  74%|███████▍  | 369/500 [14:00<04:58,  2.27s/it, loss=5.82]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment banana']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment banana']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  74%|███████▍  | 370/500 [14:02<04:55,  2.27s/it, loss=2.2] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all horse from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all horse from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  74%|███████▍  | 371/500 [14:04<04:54,  2.28s/it, loss=3.92]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment white toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment white toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  74%|███████▍  | 372/500 [14:07<04:51,  2.28s/it, loss=2.21]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all gray zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all gray zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  75%|███████▍  | 373/500 [14:09<04:49,  2.28s/it, loss=0.764]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all knife in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all knife in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  75%|███████▍  | 374/500 [14:11<04:46,  2.27s/it, loss=0.475]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment thin zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment thin zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  75%|███████▌  | 375/500 [14:13<04:44,  2.27s/it, loss=3.35] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every teddy bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every teddy bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  75%|███████▌  | 376/500 [14:16<04:41,  2.27s/it, loss=1.8] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment train']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment train']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  75%|███████▌  | 377/500 [14:18<04:39,  2.27s/it, loss=3.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all zebra objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all zebra objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  76%|███████▌  | 378/500 [14:20<04:37,  2.27s/it, loss=1.55]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment brown sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment brown sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  76%|███████▌  | 379/500 [14:22<04:34,  2.27s/it, loss=0.596]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all hot dog objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all hot dog objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  76%|███████▌  | 380/500 [14:25<04:32,  2.27s/it, loss=2.48] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment square person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment square person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  76%|███████▌  | 381/500 [14:27<04:31,  2.28s/it, loss=0.464]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  76%|███████▋  | 382/500 [14:29<04:29,  2.28s/it, loss=3.02] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  77%|███████▋  | 383/500 [14:32<04:26,  2.28s/it, loss=2]   

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all giraffe in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all giraffe in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  77%|███████▋  | 384/500 [14:34<04:24,  2.28s/it, loss=1.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every silver dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every silver dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  77%|███████▋  | 385/500 [14:36<04:21,  2.27s/it, loss=1.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  77%|███████▋  | 386/500 [14:38<04:19,  2.27s/it, loss=2.87]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  77%|███████▋  | 387/500 [14:41<04:16,  2.27s/it, loss=8.93]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bowl in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bowl in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  78%|███████▊  | 388/500 [14:43<04:14,  2.27s/it, loss=0.865]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all fire hydrant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all fire hydrant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  78%|███████▊  | 389/500 [14:45<04:12,  2.27s/it, loss=2.03] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all sink']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all sink']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 

Epoch 2:  78%|███████▊  | 390/500 [14:47<04:10,  2.28s/it, loss=0.23]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every orange zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every orange zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  78%|███████▊  | 391/500 [14:50<04:08,  2.28s/it, loss=3.23]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all truck objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all truck objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  78%|███████▊  | 392/500 [14:52<04:06,  2.28s/it, loss=2]   

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  79%|███████▊  | 393/500 [14:54<04:04,  2.28s/it, loss=2.71]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  79%|███████▉  | 394/500 [14:57<04:01,  2.28s/it, loss=4.24]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  79%|███████▉  | 395/500 [14:59<03:59,  2.28s/it, loss=2.36]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all white car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all white car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  79%|███████▉  | 396/500 [15:01<03:57,  2.28s/it, loss=0.264]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all suitcase in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all suitcase in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  79%|███████▉  | 397/500 [15:03<03:55,  2.28s/it, loss=5.02] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all gray motorcycle objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all gray motorcycle objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  80%|███████▉  | 398/500 [15:06<03:53,  2.29s/it, loss=6.1] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all car from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all car from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  80%|███████▉  | 399/500 [15:08<03:51,  2.29s/it, loss=0.305]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all sink from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all sink from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  80%|████████  | 400/500 [15:10<03:48,  2.28s/it, loss=1.39] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  80%|████████  | 401/500 [15:13<03:47,  2.29s/it, loss=0.117]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all potted plant regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all potted plant regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  80%|████████  | 402/500 [15:15<03:45,  2.30s/it, loss=1.47] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all circular dining table in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all circular dining table in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hyp

Epoch 2:  81%|████████  | 403/500 [15:17<03:42,  2.30s/it, loss=0.202]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  81%|████████  | 404/500 [15:20<03:39,  2.29s/it, loss=1.22] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  81%|████████  | 405/500 [15:22<03:37,  2.29s/it, loss=5.99]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  81%|████████  | 406/500 [15:24<03:34,  2.29s/it, loss=0.107]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bus in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bus in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  81%|████████▏ | 407/500 [15:26<03:32,  2.29s/it, loss=4.12] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all tall airplane in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all tall airplane in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  82%|████████▏ | 408/500 [15:29<03:30,  2.29s/it, loss=1.27]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  82%|████████▏ | 409/500 [15:31<03:28,  2.29s/it, loss=1.78]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bicycle regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bicycle regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  82%|████████▏ | 410/500 [15:33<03:26,  2.29s/it, loss=0.549]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  82%|████████▏ | 411/500 [15:36<03:24,  2.29s/it, loss=0.624]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all handbag objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all handbag objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  82%|████████▏ | 412/500 [15:38<03:21,  2.29s/it, loss=0.889]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  83%|████████▎ | 413/500 [15:40<03:19,  2.29s/it, loss=0.778]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  83%|████████▎ | 414/500 [15:42<03:17,  2.29s/it, loss=0.376]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  83%|████████▎ | 415/500 [15:45<03:14,  2.29s/it, loss=0.383]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment round stop sign']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment round stop sign']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  83%|████████▎ | 416/500 [15:47<03:11,  2.28s/it, loss=2.86] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all train objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all train objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  83%|████████▎ | 417/500 [15:49<03:09,  2.28s/it, loss=3.73]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every spoon']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every spoon']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  84%|████████▎ | 418/500 [15:52<03:06,  2.28s/it, loss=0.268]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment parking meter']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment parking meter']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  84%|████████▍ | 419/500 [15:54<03:04,  2.28s/it, loss=1.98] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment cow']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment cow']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  84%|████████▍ | 420/500 [15:56<03:01,  2.27s/it, loss=3.03]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment spoon']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment spoon']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  84%|████████▍ | 421/500 [15:58<03:00,  2.29s/it, loss=0.189]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all blue toilet in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all blue toilet in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  84%|████████▍ | 422/500 [16:01<02:58,  2.29s/it, loss=0.558]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every circular person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every circular person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  85%|████████▍ | 423/500 [16:03<02:55,  2.28s/it, loss=1.4]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  85%|████████▍ | 424/500 [16:05<02:53,  2.28s/it, loss=3.15]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment tall horse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment tall horse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  85%|████████▌ | 425/500 [16:08<02:51,  2.28s/it, loss=0.98]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all oval cat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all oval cat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  85%|████████▌ | 426/500 [16:10<02:49,  2.29s/it, loss=2.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every clock']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every clock']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  85%|████████▌ | 427/500 [16:12<02:46,  2.28s/it, loss=3.46]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bowl in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bowl in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  86%|████████▌ | 428/500 [16:14<02:43,  2.28s/it, loss=1.84]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all thin book']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all thin book']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided →

Epoch 2:  86%|████████▌ | 429/500 [16:17<02:41,  2.28s/it, loss=1.23]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all round person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all round person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  86%|████████▌ | 430/500 [16:19<02:39,  2.28s/it, loss=2.29]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all sheep regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all sheep regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  86%|████████▌ | 431/500 [16:21<02:37,  2.29s/it, loss=0.0656]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment silver zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment silver zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  86%|████████▋ | 432/500 [16:23<02:35,  2.29s/it, loss=1.23]  

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all bench']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all bench']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  87%|████████▋ | 433/500 [16:26<02:33,  2.28s/it, loss=1.17]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all black sink regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all black sink regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  87%|████████▋ | 434/500 [16:28<02:30,  2.28s/it, loss=1.31]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment yellow person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment yellow person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  87%|████████▋ | 435/500 [16:30<02:28,  2.28s/it, loss=0.966]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all airplane regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all airplane regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  87%|████████▋ | 436/500 [16:33<02:25,  2.27s/it, loss=1.06] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every gray cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every gray cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  87%|████████▋ | 437/500 [16:35<02:23,  2.27s/it, loss=1.39]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all cat']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all cat']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  88%|████████▊ | 438/500 [16:37<02:21,  2.28s/it, loss=0.995]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all giraffe objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all giraffe objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  88%|████████▊ | 439/500 [16:39<02:18,  2.28s/it, loss=1.68] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment zebra']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment zebra']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  88%|████████▊ | 440/500 [16:42<02:16,  2.27s/it, loss=2.19]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all train in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all train in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  88%|████████▊ | 441/500 [16:44<02:14,  2.28s/it, loss=2.44]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment sheep']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment sheep']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  88%|████████▊ | 442/500 [16:46<02:12,  2.28s/it, loss=1.12]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all pink potted plant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all pink potted plant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  89%|████████▊ | 443/500 [16:49<02:09,  2.28s/it, loss=0.259]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all potted plant in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all potted plant in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  89%|████████▉ | 444/500 [16:51<02:07,  2.28s/it, loss=0.0902]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all giraffe in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all giraffe in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  89%|████████▉ | 445/500 [16:53<02:05,  2.28s/it, loss=0.466] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  89%|████████▉ | 446/500 [16:55<02:03,  2.28s/it, loss=0.145]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment wine glass']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment wine glass']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  89%|████████▉ | 447/500 [16:58<02:00,  2.28s/it, loss=1.52] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all gray person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all gray person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks

Epoch 2:  90%|████████▉ | 448/500 [17:00<01:58,  2.28s/it, loss=5.01]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all green person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all green person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  90%|████████▉ | 449/500 [17:02<01:56,  2.28s/it, loss=7.87]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all train in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all train in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  90%|█████████ | 450/500 [17:04<01:53,  2.28s/it, loss=3.41]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all purple person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all purple person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  90%|█████████ | 451/500 [17:07<01:51,  2.28s/it, loss=0.561]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all oval umbrella objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all oval umbrella objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  90%|█████████ | 452/500 [17:09<01:49,  2.28s/it, loss=1.19] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  91%|█████████ | 453/500 [17:11<01:47,  2.28s/it, loss=3.61]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all cow objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all cow objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  91%|█████████ | 454/500 [17:14<01:44,  2.28s/it, loss=0.386]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all square toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all square toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  91%|█████████ | 455/500 [17:16<01:42,  2.28s/it, loss=0.116]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment airplane']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment airplane']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.l

Epoch 2:  91%|█████████ | 456/500 [17:18<01:40,  2.28s/it, loss=4.92] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all cow regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all cow regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  91%|█████████▏| 457/500 [17:20<01:38,  2.28s/it, loss=1.44]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all tall dog objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all tall dog objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  92%|█████████▏| 458/500 [17:23<01:35,  2.28s/it, loss=0.314]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cat from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cat from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  92%|█████████▏| 459/500 [17:25<01:33,  2.28s/it, loss=2.03] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all circular sink objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all circular sink objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  92%|█████████▏| 460/500 [17:27<01:31,  2.28s/it, loss=0.316]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  92%|█████████▏| 461/500 [17:30<01:29,  2.28s/it, loss=0.375]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all small person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all small person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetwor

Epoch 2:  92%|█████████▏| 462/500 [17:32<01:26,  2.29s/it, loss=0.139]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every triangular potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every triangular potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  93%|█████████▎| 463/500 [17:34<01:24,  2.28s/it, loss=5.24] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all bird regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all bird regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A 

Epoch 2:  93%|█████████▎| 464/500 [17:36<01:22,  2.28s/it, loss=1.14]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all person from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all person from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  93%|█████████▎| 465/500 [17:39<01:19,  2.28s/it, loss=0.729]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  93%|█████████▎| 466/500 [17:41<01:17,  2.28s/it, loss=3]    

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  93%|█████████▎| 467/500 [17:43<01:15,  2.28s/it, loss=6.67]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all elephant from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all elephant from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.

Epoch 2:  94%|█████████▎| 468/500 [17:46<01:12,  2.28s/it, loss=1.87]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all zebra objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all zebra objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  94%|█████████▍| 469/500 [17:48<01:10,  2.28s/it, loss=1.43]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment silver person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment silver person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  94%|█████████▍| 470/500 [17:50<01:08,  2.28s/it, loss=2.67]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all oval banana in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all oval banana in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  94%|█████████▍| 471/500 [17:52<01:06,  2.29s/it, loss=3.13]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  94%|█████████▍| 472/500 [17:55<01:04,  2.29s/it, loss=1.55]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment tall traffic light']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment tall traffic light']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_m

Epoch 2:  95%|█████████▍| 473/500 [17:57<01:01,  2.29s/it, loss=0.146]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all blue person in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all blue person in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  95%|█████████▍| 474/500 [17:59<00:59,  2.29s/it, loss=1.39] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256, 4)

Epoch 2:  95%|█████████▌| 475/500 [18:02<00:57,  2.29s/it, loss=3.24]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment potted plant']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment potted plant']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  95%|█████████▌| 476/500 [18:04<00:54,  2.28s/it, loss=4.08]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all large cow in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all large cow in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  95%|█████████▌| 477/500 [18:06<00:52,  2.29s/it, loss=2.97]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every dog']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every dog']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  96%|█████████▌| 478/500 [18:08<00:50,  2.29s/it, loss=0.121]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every tie']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every tie']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 256

Epoch 2:  96%|█████████▌| 479/500 [18:11<00:47,  2.29s/it, loss=0.204]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all suitcase']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all suitcase']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (

Epoch 2:  96%|█████████▌| 480/500 [18:13<00:45,  2.28s/it, loss=3.29] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  96%|█████████▌| 481/500 [18:15<00:43,  2.29s/it, loss=1.15]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment round bird']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment round bird']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2:  96%|█████████▋| 482/500 [18:18<00:41,  2.29s/it, loss=0.639]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all giraffe from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all giraffe from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.

Epoch 2:  97%|█████████▋| 483/500 [18:20<00:38,  2.29s/it, loss=2.97] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all bottle in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all bottle in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  97%|█████████▋| 484/500 [18:22<00:36,  2.29s/it, loss=0.94]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['extract all cow from the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['extract all cow from the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2

Epoch 2:  97%|█████████▋| 485/500 [18:24<00:34,  2.28s/it, loss=3.33]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['detect and segment car']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['detect and segment car']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided

Epoch 2:  97%|█████████▋| 486/500 [18:27<00:31,  2.28s/it, loss=3.16]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all person']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all person']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  97%|█████████▋| 487/500 [18:29<00:29,  2.28s/it, loss=2.8] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment any object']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment any object']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1, 2

Epoch 2:  98%|█████████▊| 488/500 [18:31<00:27,  2.29s/it, loss=0.17]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 1, 2])
[DEBUG] point_labels shape: torch.Size([1, 1])
[DEBUG] sparse_embeddings shape: torch.Size([1, 2, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment bear']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment bear']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2:  98%|█████████▊| 489/500 [18:34<00:25,  2.29s/it, loss=1.09]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all person regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all person regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lor

Epoch 2:  98%|█████████▊| 490/500 [18:36<00:22,  2.29s/it, loss=2.05]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment black banana']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment black banana']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.la

Epoch 2:  98%|█████████▊| 491/500 [18:38<00:20,  2.29s/it, loss=0.515]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['identify and segment oval toilet']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['identify and segment oval toilet']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.laye

Epoch 2:  98%|█████████▊| 492/500 [18:40<00:18,  2.29s/it, loss=2.32] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all black suitcase regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all black suitcase regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlp

Epoch 2:  99%|█████████▊| 493/500 [18:43<00:16,  2.29s/it, loss=1.44]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all horse objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all horse objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A prov

Epoch 2:  99%|█████████▉| 494/500 [18:45<00:13,  2.29s/it, loss=0.396]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment every mouse']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment every mouse']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  99%|█████████▉| 495/500 [18:47<00:11,  2.28s/it, loss=0.095]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['highlight all zebra regions']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['highlight all zebra regions']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_

Epoch 2:  99%|█████████▉| 496/500 [18:50<00:09,  2.28s/it, loss=2.07] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['segment all giraffe']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['segment all giraffe']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A provided → (1,

Epoch 2:  99%|█████████▉| 497/500 [18:52<00:06,  2.28s/it, loss=1.16]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2: 100%|█████████▉| 498/500 [18:54<00:04,  2.28s/it, loss=1.85]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['find all brown car in the image']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['find all brown car in the image']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers

Epoch 2: 100%|█████████▉| 499/500 [18:56<00:02,  2.28s/it, loss=0.709]

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
[DEBUG] raw task_description type: <class 'list'>, value: ['locate all person objects']
[DEBUG] converted task_descriptions type: <class 'list'> value: ['locate all person objects']
LoRA weights from hypernetwork:
  • output_hypernetworks_mlps.0.layers.0.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.0.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_A provided → (1, 256, 4)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.1.lora_B provided → (1, 4, 256)  (numel=1024)
  • output_hypernetworks_mlps.0.layers.2.lora_A pr

Epoch 2: 100%|██████████| 500/500 [18:59<00:00,  2.28s/it, loss=1.48] 

[DEBUG] image_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] point_coords shape: torch.Size([1, 5, 2])
[DEBUG] point_labels shape: torch.Size([1, 5])
[DEBUG] sparse_embeddings shape: torch.Size([1, 6, 256])
[DEBUG] dense_embeddings shape: torch.Size([1, 256, 64, 64])
[DEBUG] image_pe shape: torch.Size([1, 256, 64, 64])
[DEBUG] dense_embeddings final shape: torch.Size([1, 256, 64, 64])
Epoch 2/2, Average Loss: 1.7307





 ✔️Saved checkpoint to /mnt/data/checkpoints/checkpoint_epoch_1.pth
Training completed!


In [25]:
from IPython.display import HTML, FileLink

# Browser download anchors: first the zipped checkpoints, then the SAM
# backbone checkpoint. Passing both objects to one display() call renders
# them one after the other, in argument order.
display(
    HTML('<a href="checkpoints.zip" download>⬇️ Download checkpoints.zip</a>'),
    HTML('<a href="sam_vit_h_4b8939.pth" download>⬇️ Download sam_vit_h_4b8939.pth</a>'),
)

# Alternative route to the same two files via IPython FileLink objects.
print("Or use FileLink widgets below:")
display(
    FileLink('checkpoints.zip'),
    FileLink('sam_vit_h_4b8939.pth'),
)



Or use FileLink widgets below:
