In [None]:

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForImageTextRetrieval

processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-large-coco")
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-large-coco")


In [None]:
from peft import get_peft_model, LoraConfig, TaskType
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    lora_dropout=0.01,
    target_modules=["qkv", "query", "key", "value", "dense", "projection", "fc1", "fc2", "text_proj", "visual_proj", "position_embeddings"],
)
model = get_peft_model(model, lora_config)  

In [None]:
model.print_trainable_parameters()

In [None]:
processor(images=Image.new('RGB', (94, 34)), text="A cat", return_tensors="pt").pixel_values.shape

In [None]:

model = model.cuda()
from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
train_ds = load_dataset("zai-org/VisionRewardDB-Image", split='train[:40000]')
test_ds = load_dataset("zai-org/VisionRewardDB-Image", split='train[40000:]')

import io, math, random
import numpy as np
from PIL import Image, ImageFilter, ImageEnhance
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd

df = pd.read_csv("rules.csv")

import pandas as pd
import re

# Normalise the header row: strip stray whitespace around column names.
df.columns = df.columns.str.strip()
# Forward-fill 'Dimension' so rows with an empty cell inherit the value above them.
df['Dimension'] = df['Dimension'].ffill()

# Short key for each dimension: the text inside the first "(...)" of the
# 'Dimension' cell, or the whole cell when it contains no parentheses.
df['dim_key'] = df['Dimension'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else x)

# guide[dim_key][score] -> "<Option>: <Description>" for every row of that dimension.
guide = {
    dim_key: {
        int(row['Score']): row['Option'] + ": " +str(row['Description']).strip()
        for _, row in group.iterrows()
    }
    for dim_key, group in df.groupby('dim_key')
}

# Dimensions actually used downstream: every guide key except the excluded ones listed here.
dims = {k: v for k, v in guide.items() if k not in ["unsafe type", "hands", "face", "body", "safety", "lighting aesthetic", "symmetry"]}.keys()
dims = list(dims)
# Lowest score value defined for each dimension (computed for all guide keys, not only `dims`).
dim_min = {i:min(guide[i].keys()) for i in guide.keys()}

# %%

import json
with open("prompts.json", "r") as f:
    prompt_dict = json.load(f)



In [None]:

def format_data(sample):
    """Convert a batch of dataset rows into BLIP inputs with soft labels.

    For every image one dimension from the module-level ``dims`` list is
    drawn at random: with probability ~0.5 among the dimensions whose
    annotation score is >= 0, otherwise among those whose score is < 0.
    If the chosen pool is empty, a dimension is drawn from all of ``dims``.

    Args:
        sample: batched mapping with an ``"image"`` column and an
            ``"annotation"`` column (one per-dimension score mapping per image).

    Returns:
        dict with ``pixel_values``, ``input_ids``, ``attention_mask``
        (from ``processor``), ``labels`` (float32 tensor: 1.0 for score < 0,
        0.5 for score == 0, 0.0 for score > 0), ``dims`` (the sampled
        dimension names) and ``n_images`` (the batch size repeated once per row).
    """
    images = list(sample['image'])
    annotations = sample['annotation']

    dims_selected = []
    for row in range(len(images)):
        annotation = annotations[row]
        if random.random() > 0.5:
            # Pool of dimensions with a non-negative score.
            candidates = [d for d in dims if annotation[d] >= 0]
        else:
            # Pool of dimensions with a negative score.
            candidates = [d for d in dims if annotation[d] < 0]
        # An empty pool falls back to any dimension (the original relied on
        # random.choice raising IndexError for this).
        dims_selected.append(random.choice(candidates) if candidates else random.choice(dims))

    prompts = [prompt_dict[dim] for dim in dims_selected]
    inputs = processor(images=images, text=prompts, return_tensors="pt", padding=True)

    answers = [
        1 if annotation[dim] < 0 else (0.5 if annotation[dim] == 0 else 0)
        for annotation, dim in zip(annotations, dims_selected)
    ]
    # Force float32: a batch without any 0.5 label would otherwise be inferred
    # as int64, which the BCE/L1 losses reject against float logits.
    labels = torch.tensor(answers, dtype=torch.float32)

    return {
        'pixel_values': inputs['pixel_values'],
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
        'dims': dims_selected,
        'n_images': [len(images)] * len(inputs['input_ids']),
    }



# %%
train_ds = train_ds.with_transform(format_data)
test_ds = test_ds.with_transform(format_data)

In [None]:
# train_ds[0:7]["labels"]

In [None]:

import wandb

import torch
from transformers import PreTrainedModel, PretrainedConfig

class Rater(PreTrainedModel):
    """Scalar rating head on top of a BLIP image-text backbone.

    The first token of the backbone's ``question_embeds`` is projected to a
    single logit per (image, prompt) pair. When ``labels`` are given, the loss
    is BCE-with-logits plus L1 on the sigmoid output, and it is stored on the
    returned backbone output under ``"loss"``.
    """

    def __init__(self, backbone):
        super().__init__(PretrainedConfig())
        self.backbone = backbone
        # NOTE(review): 768 must equal the last dimension of
        # `question_embeds` produced by the backbone — confirm if the
        # backbone checkpoint changes.
        self.head = torch.nn.Linear(768, 1)

    def forward(self, pixel_values, input_ids, attention_mask, n_images, labels=None):
        """Run the backbone and head; attach ``loss`` when ``labels`` is given.

        Args:
            pixel_values, input_ids, attention_mask: forwarded to the backbone.
            n_images: per-row batch size as produced by the data transform;
                only the first element is used, as a sanity check.
            labels: optional soft targets in [0, 1], one per row.

        Returns:
            The backbone output object, with ``outputs['loss']`` set when
            ``labels`` is not None.
        """
        import warnings

        n_images = n_images[0]
        outputs = self.backbone(pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask)
        itm_scores = self.head(outputs.question_embeds[:, 0, :]).squeeze(-1)

        if labels is not None:
            # Integer labels (e.g. a batch of only 0/1) would make the losses
            # fail on a dtype mismatch, so align them with the logits.
            labels = labels.to(dtype=itm_scores.dtype)
            assert itm_scores.shape == labels.shape, f"{itm_scores.shape} {labels.shape}"
            assert itm_scores.shape[0] == n_images
            bce_loss = torch.nn.functional.binary_cross_entropy_with_logits(itm_scores, labels)
            mae_loss = torch.nn.functional.l1_loss(torch.sigmoid(itm_scores), labels)
            loss = bce_loss + mae_loss

            # Metric logging is best-effort: skip it when no wandb run is
            # active, and never let a logging failure abort the step. Unlike
            # a bare `except`, this does not swallow KeyboardInterrupt.
            if wandb.run is not None:
                try:
                    wandb.log({"bce_loss": bce_loss, "acc": ((itm_scores > 0) == (labels > 0.5)).float().mean(), "mae_loss": mae_loss})
                except Exception as exc:
                    warnings.warn(f"wandb.log failed: {exc}")
            outputs['loss'] = loss

        return outputs

my_rater = Rater(model)




In [None]:
# my_rater = my_rater.cpu()
# with torch.no_grad():
#     my_rater(**train_ds[0:2]) 

In [None]:

from transformers import TrainingArguments
import os
# Hyper-parameters for the reward-model fine-tuning run. Evaluation and
# checkpointing both happen every 500 steps so that the best checkpoint can
# be restored at the end of training.
training_args = TrainingArguments(
    output_dir="BLIP-Reward-Long",
    learning_rate=3e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    num_train_epochs=20,
    weight_decay=0.001,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=1,
    load_best_model_at_end=True,
    push_to_hub=True,
    max_grad_norm=1.0,
    # The dataset transform needs the raw columns, so they must not be pruned.
    remove_unused_columns=False,
    # os.cpu_count() may return None when the count cannot be determined;
    # fall back to a single worker instead of raising TypeError in min().
    dataloader_num_workers=min(os.cpu_count() or 1, 16),
    fp16=True,
    warmup_ratio=0.01,
    lr_scheduler_type="cosine",
    # lr_scheduler_kwargs={"num_decay_steps": 500},
    report_to="wandb"
)


from peft import get_peft_model, LoraConfig, TaskType
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["qkv", "query", "key", "value", "dense", "projection"],
    modules_to_save=["head"] 
)
my_rater = get_peft_model(my_rater, lora_config)  
my_rater = my_rater.to("cuda")
my_rater.print_trainable_parameters()
from transformers import Trainer 

trainer = Trainer(
    model=my_rater,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=processor,
)

# %%
trainer.train() 


In [None]:
from datasets import load_dataset
dataset = load_dataset("weathon/aas_benchmark", split="train")

In [None]:
%pip install wordcloud

In [2]:

from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
# train_ds = load_dataset("zai-org/VisionRewardDB-Image", split='train[:40000]')
# test_ds = load_dataset("zai-org/VisionRewardDB-Image", split='train[40000:]')

import io, math, random
import numpy as np
from PIL import Image, ImageFilter, ImageEnhance
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd

df = pd.read_csv("rules.csv")

import pandas as pd
import re

df.columns = df.columns.str.strip()
df['Dimension'] = df['Dimension'].ffill()

df['dim_key'] = df['Dimension'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else x)

guide = {
    dim_key: {
        int(row['Score']): row['Option'] + ": " +str(row['Description']).strip()
        for _, row in group.iterrows()
    }
    for dim_key, group in df.groupby('dim_key')
}

dims = {k: v for k, v in guide.items() if k not in ["unsafe type", "hands", "face", "body", "safety", "lighting aesthetic", "symmetry"]}.keys()
dims = list(dims)
dim_min = {i:min(guide[i].keys()) for i in guide.keys()}

# %%

import json
with open("prompts.json", "r") as f:
    prompt_dict = json.load(f)



In [3]:
dims

['background',
 'clarity',
 'color aesthetic',
 'color brightness',
 'detail realism',
 'detail refinement',
 'emotion',
 'lighting distinction',
 'main object',
 'object pairing',
 'richness']

In [None]:
all_prompts = " ".join(list(dataset['prompt_original']))
all_distorted_prompts = " ".join(list(dataset['prompt_distorted']))

In [None]:
import re
from wordcloud import STOPWORDS

text = all_distorted_prompts

text = re.sub(r'[^A-Za-z\s]', '', text)

text = text.lower()

stopwords = set(STOPWORDS)
text = ' '.join(word for word in text.split() if word not in stopwords)

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
 
wordcloud = WordCloud(width=256, height=412, background_color='white').generate(text)
Image.fromarray(np.array(wordcloud))

In [None]:
index_of_interest = {}
for dim in guide.keys():
    if dim not in ["unsafe type", "hands", "face", "body", "safety", "lighting aesthetic", "symmetry"]:
        index_of_interest[dim] = dims.index(dim)

In [None]:
def get_delta_rater(sample):
    """Add summed rater statistics to ``sample`` and return it.

    Writes ``delta_rater`` (sum of distorted minus original scores) and
    ``rater_dp`` (sum of the distorted scores) into the same mapping.
    """
    rater = sample["rater"]
    scores_original = np.array(rater["original"]["scores"])
    scores_distorted = np.array(rater["distorted"]["scores"])

    score_change = scores_distorted - scores_original
    sample['delta_rater'] = sum(score_change)
    sample['rater_dp'] = sum(scores_distorted)
    return sample

In [None]:
def get_hpsv3_reward(sample):
    """Add HPSv3 reward statistics to ``sample`` and return it.

    Uses the first element of ``hpsv3_oidp`` and ``hpsv3_didp``; writes
    ``delta_hpsv3_reward`` (didp minus oidp) and ``hpsv3_reward_dp`` (didp).
    """
    rewards = sample["hpsv3_reward"]
    reward_oidp = rewards["hpsv3_oidp"][0]
    reward_didp = rewards["hpsv3_didp"][0]

    sample['delta_hpsv3_reward'] = reward_didp - reward_oidp
    sample['hpsv3_reward_dp'] = reward_didp
    return sample

In [None]:
def get_rater_selected(sample):
    """Add the rater score change restricted to the sample's selected dims.

    ``selected_dims`` is a JSON-encoded list of dimension names; each name is
    mapped to a score position through the module-level ``index_of_interest``.
    Writes ``delta_rater_selected`` (sum of distorted minus original scores at
    those positions) and returns the same mapping.
    """
    selected_names = json.loads(sample["selected_dims"])
    positions = np.array([index_of_interest[name] for name in selected_names])

    rater = sample["rater"]
    picked_original = np.take(np.array(rater["original"]["scores"]), positions)
    picked_distorted = np.take(np.array(rater["distorted"]["scores"]), positions)

    sample['delta_rater_selected'] = sum(picked_distorted - picked_original)
    return sample

In [None]:
# dataset = dataset.remove_columns(['image_original', 'image_distorted'])

In [None]:
dataset = dataset.map(get_delta_rater)
dataset = dataset.map(get_hpsv3_reward)
dataset = dataset.map(get_rater_selected)

In [None]:
import numpy as np
import pylab
from scipy import stats
x = dataset['delta_rater_selected']
y = dataset['delta_hpsv3_reward']
df = pd.DataFrame({'x': x, 'y': y}) 
# df = df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]
pylab.scatter(df['x'], df['y'])

In [None]:
# idx = 30
# idx = np.where((df['x'] < 0) & (df['y'] > 0))[0][idx]
# print(dataset[idx]["hpsv3_reward"]["hpsv3_oidp"][0], dataset[idx]["hpsv3_reward"]["hpsv3_didp"][0], dataset[idx]["delta_hpsv3_reward"])
# img_original = dataset[idx]["image_original"].convert("RGB")
# img_distorted = dataset[idx]["image_distorted"].convert("RGB")
# new_img = Image.new("RGB", (img_original.width + img_distorted.width, img_original.height))
# new_img.paste(img_original, (0, 0))
# new_img.paste(img_distorted, (img_original.width, 0))
# display(new_img)


In [None]:
from scipy.stats import pearsonr, spearmanr
pearson_corr, _ = pearsonr(x, y)
spearman_corr, _ = spearmanr(x, y)
print(f"Pearson correlation: {pearson_corr}")
print(f"Spearman correlation: {spearman_corr}") 

In [None]:
# NOTE(review): `p` is not defined anywhere in the visible notebook; on a fresh
# kernel this cell presumably raises NameError — confirm and remove the cell.
p

In [None]:
# Kendall tau
from scipy.stats import kendalltau
kendall_corr, _ = kendalltau(np.array(dataset['delta_rater']).mean(-1), dataset['hpsv3_reward'])
print(f"Kendall tau correlation: {kendall_corr}")

In [None]:
success_index = np.where(np.array(dataset['delta_rater']) < 0)[0]
failed_index = np.where(np.array(dataset['delta_rater']) >= 0)[0]

In [None]:
pylab.hist(np.array(dataset['hpsv3_reward'])[success_index], alpha=0.5, label='successful cases')
pylab.hist(np.array(dataset['hpsv3_reward'])[failed_index], alpha=0.5, label='failed cases') 
pylab.legend()

In [None]:
X = np.array(dataset['delta_rater'])
Y = np.array(dataset['hpsv3_reward'])

In [None]:
X.shape, Y.shape 

In [None]:
from openai import OpenAI
import dotenv
dotenv.load_dotenv()

In [None]:
import os
# NOTE(review): "DEEPNFRA_API_KEY" looks like a typo for "DEEPINFRA_API_KEY"
# (the base_url below points at api.deepinfra.com) — confirm against the .env
# file before renaming, since the variable name must match what is set there.
key = os.environ["DEEPNFRA_API_KEY"]

# OpenAI-compatible client aimed at the DeepInfra endpoint.
# Note: this rebinds the name `openai` to a client instance in the notebook namespace.
openai = OpenAI(
    api_key=key,
    base_url="https://api.deepinfra.com/v1/openai",
)


completion = openai.chat.completions.create(
  model="Qwen/Qwen3-VL-30B-A3B-Thinking",
  messages=[
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "What is in this image?"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
          }
        }
      ]
    }
  ]
)
print(completion.choices[0].message.content)

In [None]:
from datasets import load_dataset
dataset = load_dataset("weathon/aas_benchmark", split="train")

In [None]:
dataset["rater"]

In [None]:
# idxs = 10, 105

In [None]:
# 10 107 980

In [None]:
import pylab
idx = 980
print("llm_selected", dataset["llm_selected"][idx])
print("HPSv2 selected", int(dataset[idx]["hpsv2_reward"]["hpsv2_didp"] > dataset[idx]["hpsv2_reward"]["hpsv2_oidp"]))
print("HPSv3 selected", int(dataset[idx]["hpsv3_reward"]["hpsv3_didp"][0] > dataset[idx]["hpsv3_reward"]["hpsv3_oidp"][0]))
print("image_reward selected", int(dataset[idx]["image_reward"]["image_reward_didp"] > dataset[idx]["image_reward"]["image_reward_oidp"]))
# print("rater_selected", int(sum(dataset[idx]["rater"]["distorted"]["scores"]) < sum(dataset[idx]["rater"]["original"]["scores"])))
pylab.subplot(1, 2, 1)
pylab.imshow(dataset["image_original"][idx])
pylab.axis("off")
pylab.subplot(1, 2, 2)
pylab.imshow(dataset["image_distorted"][idx])
pylab.axis("off")
pylab.tight_layout()

In [None]:
blip_score = [i.tolist() for i in blip_score]

In [None]:
dataset.push_to_hub("weathon/aas_benchmark", private=True)

In [None]:
idxs = [10, 980, 109, 208, 906, 394]

In [None]:
import numpy as np
np.where(np.array(dataset["llm_selected"])!=1)

In [None]:
dataset

In [None]:
import pylab
import json
idx = 27
print("llm_selected", dataset["llm_selected"][idx])
print("HPSv2 selected", int(dataset[idx]["hpsv2_reward"]["hpsv2_didp"] > dataset[idx]["hpsv2_reward"]["hpsv2_oidp"]))
print("HPSv3 selected", int(dataset[idx]["hpsv3_reward"]["hpsv3_didp"][0] > dataset[idx]["hpsv3_reward"]["hpsv3_oidp"][0]))
print("image_reward selected", int(dataset[idx]["image_reward"]["image_reward_didp"] > dataset[idx]["image_reward"]["image_reward_oidp"]))
print("blip selected", dataset["blip_selected"][idx]) 
print("Effect", ", ".join(json.loads(dataset["selected_dims"][idx])))
print("Effect", dataset["prompt_distorted"][idx])
print("Effect", dataset["selected_dims"][idx])
# print("rater_selected", int(sum(dataset[idx]["rater"]["distorted"]["scores"]) < sum(dataset[idx]["rater"]["original"]["scores"])))
pylab.subplot(1, 2, 1)
pylab.imshow(dataset["image_original"][idx])
pylab.axis("off")
pylab.subplot(1, 2, 2)
pylab.imshow(dataset["image_distorted"][idx])
pylab.axis("off")
pylab.tight_layout()

In [None]:
dataset_no_image = dataset.remove_columns(['image_original', 'image_distorted'])

In [None]:
# tqdm is imported here because the only other import of it lives in a later
# cell, so this cell would fail with NameError on a fresh top-to-bottom run.
import tqdm

# For each reward model, record 1 when the distorted image scored strictly
# higher than the original and 0 otherwise. All three lists are filled in a
# single pass so every row is fetched once instead of three times.
hpsv3_selected = []
hpsv2_selected = []
image_reward_selected = []
for i in tqdm.tqdm(range(len(dataset_no_image))):
    row = dataset_no_image[i]
    hpsv3 = row["hpsv3_reward"]
    hpsv2 = row["hpsv2_reward"]
    image_reward = row["image_reward"]
    # The HPSv3 entries are sequences; only their first element is compared.
    hpsv3_selected.append(int(hpsv3["hpsv3_didp"][0] > hpsv3["hpsv3_oidp"][0]))
    hpsv2_selected.append(int(hpsv2["hpsv2_didp"] > hpsv2["hpsv2_oidp"]))
    image_reward_selected.append(int(image_reward["image_reward_didp"] > image_reward["image_reward_oidp"]))

In [None]:
# remove tie

hpsv3_selected = np.array(hpsv3_selected)
hpsv2_selected = np.array(hpsv2_selected)
image_reward_selected = np.array(image_reward_selected)

filtered_indices = np.array(dataset_no_image["llm_selected"]) != -1
hpsv3_selected = hpsv3_selected[filtered_indices]
hpsv2_selected = hpsv2_selected[filtered_indices]
image_reward_selected = image_reward_selected[filtered_indices]
llm_selected = np.array(dataset_no_image["llm_selected"])[filtered_indices]

In [None]:
# kappa
from sklearn.metrics import cohen_kappa_score
kappa_hpsv3 = cohen_kappa_score(llm_selected, hpsv3_selected)
kappa_hpsv3 

In [None]:
# F1 
from sklearn.metrics import f1_score
f1_hpsv3 = f1_score(llm_selected, hpsv3_selected, average='micro')
f1_hpsv3 

In [None]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(llm_selected, image_reward_selected)

In [None]:
np.unique(dataset["model"])

In [None]:
dataset_no_image = dataset.remove_columns(['image_original', 'image_distorted'])

In [None]:
sorted(df["model"])

In [None]:
ds = dataset
counter = {}
models = []
for idx, m in zip(ds["index"], ds["model"]):
    if m == "sd3_medium_grpo":
        key = (idx, m)
        counter[key] = counter.get(key, 0) + 1
        if counter[key] == 1:
            models.append(m)
        else:
            models.append(m + "_" + str(counter[key]))
    else:
        models.append(m)
ds = ds.remove_columns("model").add_column("model", models)

In [None]:
np.unique(ds["model"])

In [None]:
ds.push_to_hub("weathon/aas_benchmark_2", private=True) 

In [None]:
dataset=ds

In [None]:
sum(np.array(dataset["model"])=="sd3_medium_grpo")

In [None]:
orders = ['flux_dev', 'dance_flux', 'grpo_flux', 'flux_krea', 'stable_diffusion_xl',
       'playground', 'stable_diffusion_3.5_medium', 'sd3_medium_grpo', 'sd3_medium_grpo_geneval', 'nano-banana']

In [None]:
name_mapping = {
    'flux_dev': "Flux Dev",
    'dance_flux': "Dance Flux", 
    'grpo_flux': "PrefFlux",
    'flux_krea': "Flux Krea",
    'stable_diffusion_xl': "SDXL",
    'playground': "Playground",
    'stable_diffusion_3.5_medium': "SD3.5M",
    'sd3_medium_grpo': "SD3.5M-PickScore",
    'sd3_medium_grpo_geneval': "SD3.5M-GenEval",
    'nano-banana': "Nano Banana",
}

In [None]:
# 10, 30

In [None]:
import json

relative_idx_list = [10, 170, 78, 48]

# Panels are positioned by a model's rank in `orders`, so the grid needs
# exactly len(orders) columns. Deriving the column count from the number of
# rows only works when the two happen to coincide.
n_columns = len(orders)
# Invariant across the loop, so materialise the index column once.
all_indices = np.array(dataset_no_image["index"])

for relative_idx in relative_idx_list:
    # `row_ids` instead of `slice`, to avoid shadowing the builtin.
    row_ids = np.where(all_indices == relative_idx)[0]
    df = dataset[row_ids]
    text_str = ", ".join([s.title() for s in json.loads(df["selected_dims"][0])])
    # text_str = df["distorted_prompt"][0]

    pylab.figure(figsize=(28, 4))
    for i, model_name in enumerate(df["model"]):
        if model_name not in orders:
            continue
        ax = pylab.subplot(1, n_columns, orders.index(model_name) + 1)
        # The batch fetched above already holds the images for these rows,
        # so the whole image column does not need to be indexed again.
        pylab.imshow(df["image_distorted"][i])
        pylab.axis("off")
        ax.set_title(name_mapping[model_name], fontsize=18, pad=10)

    pylab.suptitle(text_str, y=0.02, fontsize=24, ha="center", va="bottom")
    pylab.tight_layout(rect=[0, 0.05, 1, 1])
    pylab.savefig("qualitative_" + str(relative_idx) + ".pdf")


In [None]:
len(df)

In [None]:
def _is_blue_placeholder(image, high=250, low=5):
    """Return True when ``image`` is (almost) uniformly pure blue.

    Pure blue means a mean blue channel above ``high`` AND mean red/green
    channels below ``low``. Checking blue alone would also match white or
    very bright images, whose blue channel is saturated too.
    """
    if hasattr(image, "convert"):
        # PIL images: normalise palette/greyscale/alpha modes to three channels.
        image = image.convert("RGB")
    pixels = np.asarray(image)
    if pixels.ndim != 3 or pixels.shape[2] < 3:
        return False
    red, green, blue = pixels[..., :3].reshape(-1, 3).mean(axis=0)
    return bool(blue > high and red < low and green < low)


def find_failed_images(sample):
    """Tell whether either image of ``sample`` is the solid-blue failure placeholder.

    Args:
        sample: mapping with ``image_original`` and ``image_distorted``.

    Returns:
        True if either image is essentially pure blue, otherwise False.
    """
    return _is_blue_placeholder(sample["image_original"]) or _is_blue_placeholder(sample["image_distorted"])

In [None]:
returns = [] 
for i in range(len(ds)): 
    sample = ds[i]
    returns.append(find_failed_images(sample))

In [None]:
np.where(returns) 

In [None]:
from openai import OpenAI
import dotenv
import os
dotenv.load_dotenv()
image_client = OpenAI() 
openrouter_api_key = os.environ["OPENROUTER_API_KEY"]
openrouter_client = OpenAI(base_url="https://openrouter.ai/api/v1", api_key=openrouter_api_key)
llm_client = openrouter_client

In [None]:
import base64
from io import BytesIO

def nano_banana_generate(prompt: str) -> Image.Image:
    """Generate an image for ``prompt`` through the OpenRouter client.

    The request is attempted up to five times. A response without an image
    payload and any raised exception both count as a failed attempt.

    Args:
        prompt: text prompt for the image model.

    Returns:
        The generated image converted to RGB, or a solid blue 1024x1024
        placeholder when every attempt failed.
    """
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            completion = openrouter_client.chat.completions.create(
                model="google/gemini-2.5-flash-image-preview",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": f"Generate an image with following prompt, return the image directly: {prompt}",
                            },
                        ],
                    }
                ],
            )
            message = completion.choices[0].message
            # Treat a missing attribute, None and an empty list alike: the
            # model answered without an image.
            images = getattr(message, "images", None)
            if not images:
                text_response = getattr(message, "content", "")
                print(f"nano-banana returned text-only response: {text_response}")
                raise RuntimeError("nano-banana returned text-only response")
            # The URL is handled as a data URL: everything after the first
            # comma is the base64 payload.
            image_url = images[0]["image_url"]["url"]
            image_bytes = base64.b64decode(image_url.split(",", 1)[1].replace("\x00", ""))
            return Image.open(BytesIO(image_bytes)).convert("RGB")
        except Exception as exc:
            if attempt == max_attempts:
                print(f"nano-banana failed after retries: {exc}")
            else:
                print(f"retrying nano-banana due to error: {exc}")
    # Single fallback: the solid blue placeholder returned when all attempts fail.
    return Image.new("RGB", (1024, 1024), color=(0, 0, 255))


In [None]:
dataset.select(np.where(returns)[0].tolist())

In [347]:
import base64

In [None]:

# Regenerate both images for every row flagged in `returns`.
# NOTE(review): `dataset[row_id]` presumably returns a fresh dict per access,
# in which case the two item assignments at the end of the loop body do not
# persist into `dataset` — verify; a later cell redoes this via `dataset.map`.
idx = np.where(returns)[0].tolist()
for row_id, sample in zip(idx, dataset.select(idx)):
    prompt_original = sample["prompt_original"]
    prompt_distorted = sample["prompt_distorted"]
    print(prompt_original)
    img = nano_banana_generate(prompt_original)
    img_2 = nano_banana_generate(prompt_distorted)
    dataset[row_id]["image_original"] = img
    dataset[row_id]["image_distorted"] = img_2

In [358]:
dataset[row_id]["image_original"] = img

In [None]:
np.where(returns)[0].tolist()

In [None]:
import hashlib
fingerprint = hashlib.md5(b"add_images_v1").hexdigest()

In [363]:
idx = np.where(returns)[0].tolist()

def add_images(example, i):
    """Regenerate both images of ``example`` when row ``i`` is listed in ``idx``.

    Rows whose index is not in the module-level ``idx`` are returned untouched.
    """
    if i not in idx:
        return example
    for variant in ("original", "distorted"):
        example["image_" + variant] = nano_banana_generate(example["prompt_" + variant])
    return example

dataset = dataset.map(add_images, with_indices=True, num_proc=8, new_fingerprint=fingerprint)



Map (num_proc=8):   0%|          | 0/3300 [00:00<?, ? examples/s]

retrying nano-banana due to error: 'ChatCompletionMessage' object has no attribute 'images'
retrying nano-banana due to error: 'ChatCompletionMessage' object has no attribute 'images'
retrying nano-banana due to error: 'ChatCompletionMessage' object has no attribute 'images'
retrying nano-banana due to error: 'ChatCompletionMessage' object has no attribute 'images'
retrying nano-banana due to error: 'ChatCompletionMessage' object has no attribute 'images'


In [378]:
returns = [] 
for i in range(300):
    sample = ds[i]
    returns.append(find_failed_images(sample))

In [379]:
np.where(returns) 

(array([  8,  28,  45,  54,  77,  84,  91,  92, 102, 121, 124, 125, 143,
        189, 195, 210, 221, 234, 237, 244, 245, 280, 285, 292]),)

In [None]:
dataset.push_to_hub("weathon/aas_benchmark_2", private=True)

In [None]:
# dataset 

Dataset({
    features: ['image_original', 'image_distorted', 'index', 'prompt_original', 'prompt_distorted', 'selected_dims', 'hpsv2_reward', 'hpsv3_reward', 'image_reward', 'rater', 'llm_selected', 'blip_score', 'model'],
    num_rows: 3300
})

In [None]:
# dataset = dataset.remove_columns(['llm_judge', 'blip_selected']) 

In [None]:
# 'hpsv3_reward', 'rater'

In [5]:
idx_of_interest = [  8,  28,  45,  54,  77,  84,  91,  92, 102, 121, 124, 125, 143, 189, 195, 210, 221, 234, 237, 244, 245, 280, 285, 292]

In [None]:
from datasets import load_dataset
dataset = load_dataset("weathon/aas_benchmark_2", split="train") 

README.md:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

data/train-00000-of-00015.parquet:   0%|          | 0.00/731M [00:00<?, ?B/s]

data/train-00001-of-00015.parquet:   0%|          | 0.00/685M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3300 [00:00<?, ? examples/s]

In [None]:
import torch
import requests
from PIL import Image 
from transformers import BlipProcessor, BlipForImageTextRetrieval

processor = BlipProcessor.from_pretrained("Salesforce/blip-itm-large-coco")
model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-large-coco", torch_dtype=torch.float16).to("cuda")

def blip_select(sample, i):
    """Return BLIP ITM scores for row ``i``, recomputing only flagged rows.

    Rows whose index is not in the module-level ``idx_of_interest`` keep their
    stored ``blip_score``. For flagged rows the original and the distorted
    image are both scored against the distorted prompt.

    Args:
        sample: dataset row with ``image_original``, ``image_distorted``,
            ``prompt_distorted`` and ``blip_score``.
        i: row index of ``sample`` in the dataset.

    Returns:
        The stored ``blip_score`` for untouched rows; otherwise the first
        element of the model output as a nested Python list.
    """
    if i not in idx_of_interest:
        return sample["blip_score"]
    print("Processing index:", i)
    inputs = processor([sample["image_original"], sample["image_distorted"]], [sample["prompt_distorted"]]*2, return_tensors="pt").to("cuda", torch.float16)

    # Pure inference: skip autograd bookkeeping so no graph is kept alive for
    # the forward pass.
    with torch.no_grad():
        itm_scores = model(**inputs)[0]
    return itm_scores.cpu().numpy().tolist()
    
import tqdm
blip_score = []
for i in tqdm.tqdm(range(len(dataset))):
    blip_score.append(blip_select(dataset[i], i))
dataset = dataset.remove_columns("blip_score").add_column("blip_score", blip_score)

In [391]:
dataset.save_to_disk("aas_benchmark_2_with_blip")

Saving the dataset (0/15 shards):   0%|          | 0/3300 [00:00<?, ? examples/s]

In [3]:
import infer 

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
from datasets import load_from_disk
dataset = load_from_disk("aas_benchmark_2_with_blip")

In [2]:
idx_of_interest = [  8,  28,  45,  54,  77,  84,  91,  92, 102, 121, 124, 125, 143, 189, 195, 210, 221, 234, 237, 244, 245, 280, 285, 292]

In [5]:
rater_results = [] 
import tqdm 
for i, sample in enumerate(tqdm.tqdm(dataset)):
    result = infer.rate_image(sample, i, set(idx_of_interest))
    rater_results.append(result)

  0%|          | 7/3300 [00:00<03:43, 14.72it/s]

background 0.90625 original
clarity 1.046875 original
color aesthetic 0.90625 original
color brightness 0.7421875 original
detail realism 0.77734375 original
detail refinement 1.109375 original
emotion 1.921875 original
lighting distinction 0.2578125 original
main object 2.0 original
object pairing 0.255859375 original
richness 1.4921875 original


  0%|          | 11/3300 [00:05<33:31,  1.64it/s]

background 0.02001953125 distorted
clarity 0.9453125 distorted
color aesthetic 0.470703125 distorted
color brightness 0.65625 distorted
detail realism 0.70703125 distorted
detail refinement 0.5 distorted
emotion 0.8359375 distorted
lighting distinction 0.8671875 distorted
main object 1.1015625 distorted
object pairing 0.2236328125 distorted
richness 1.234375 distorted


  1%|          | 27/3300 [00:06<04:41, 11.61it/s]

background 0.018798828125 original
clarity 1.984375 original
color aesthetic 0.87109375 original
color brightness 0.040771484375 original
detail realism 0.53515625 original
detail refinement 1.5234375 original
emotion 1.9921875 original
lighting distinction 0.3125 original
main object 2.0 original
object pairing 0.6328125 original
richness 1.5 original


  1%|          | 31/3300 [00:10<24:47,  2.20it/s]

background 0.0927734375 distorted
clarity 1.375 distorted
color aesthetic 0.89453125 distorted
color brightness 0.0068359375 distorted
detail realism 0.87109375 distorted
detail refinement 1.125 distorted
emotion 0.91015625 distorted
lighting distinction 0.1142578125 distorted
main object 1.9609375 distorted
object pairing 0.75390625 distorted
richness 1.0234375 distorted


  1%|▏         | 45/3300 [00:10<05:00, 10.85it/s]

background 0.0830078125 original
clarity 1.9453125 original
color aesthetic 0.97265625 original
color brightness 0.004425048828125 original
detail realism 0.6640625 original
detail refinement 1.28125 original
emotion 1.28125 original
lighting distinction 0.67578125 original
main object 2.0 original
object pairing 0.8828125 original
richness 1.8359375 original


  1%|▏         | 49/3300 [00:12<14:34,  3.72it/s]

background 0.65625 distorted
clarity 1.2890625 distorted
color aesthetic 0.65234375 distorted
color brightness 0.6796875 distorted
detail realism 0.0859375 distorted
detail refinement 0.421875 distorted
emotion 0.59765625 distorted
lighting distinction 1.03125 distorted
main object 1.796875 distorted
object pairing 0.5 distorted
richness 1.484375 distorted


  2%|▏         | 53/3300 [00:13<08:53,  6.08it/s]

background 0.44921875 original
clarity 1.796875 original
color aesthetic 0.6875 original
color brightness 0.8125 original
detail realism 0.2080078125 original
detail refinement 1.203125 original
emotion 1.0234375 original
lighting distinction 1.734375 original
main object 1.9921875 original
object pairing 0.70703125 original
richness 1.4375 original


  2%|▏         | 57/3300 [00:15<16:29,  3.28it/s]

background 0.7578125 distorted
clarity 1.0078125 distorted
color aesthetic 0.8671875 distorted
color brightness 0.05810546875 distorted
detail realism 0.80859375 distorted
detail refinement 0.85546875 distorted
emotion 0.8359375 distorted
lighting distinction 0.416015625 distorted
main object 1.9921875 distorted
object pairing 0.02783203125 distorted
richness 1.3359375 distorted


  2%|▏         | 77/3300 [00:16<03:37, 14.79it/s]

background 0.076171875 original
clarity 1.6015625 original
color aesthetic 1.6875 original
color brightness 0.0634765625 original
detail realism 0.1494140625 original
detail refinement 1.5546875 original
emotion 1.46875 original
lighting distinction 0.6796875 original
main object 2.0 original
object pairing 0.0673828125 original
richness 1.7109375 original


  2%|▏         | 79/3300 [00:19<27:36,  1.94it/s]

background 0.0096435546875 distorted
clarity 0.921875 distorted
color aesthetic 0.73046875 distorted
color brightness 0.10693359375 distorted
detail realism 0.2021484375 distorted
detail refinement 0.5546875 distorted
emotion 0.86328125 distorted
lighting distinction 1.0625 distorted
main object 1.9921875 distorted
object pairing 0.69921875 distorted
richness 1.0390625 distorted


  3%|▎         | 83/3300 [00:19<15:09,  3.54it/s]

background 0.0289306640625 original
clarity 1.0 original
color aesthetic 1.9140625 original
color brightness 0.1708984375 original
detail realism 0.10791015625 original
detail refinement 1.25 original
emotion 1.921875 original
lighting distinction 1.59375 original
main object 1.9921875 original
object pairing 0.181640625 original
richness 1.2890625 original


  3%|▎         | 87/3300 [00:23<29:47,  1.80it/s]

background 0.53125 distorted
clarity 0.7890625 distorted
color aesthetic 0.22265625 distorted
color brightness 0.00469970703125 distorted
detail realism 0.1845703125 distorted
detail refinement 0.609375 distorted
emotion 0.56640625 distorted
lighting distinction 0.013671875 distorted
main object 1.7890625 distorted
object pairing 0.77734375 distorted
richness 0.90625 distorted


  3%|▎         | 91/3300 [00:24<16:15,  3.29it/s]

background 0.2060546875 original
clarity 1.859375 original
color aesthetic 0.9921875 original
color brightness 0.8984375 original
detail realism 0.828125 original
detail refinement 1.2421875 original
emotion 0.53125 original
lighting distinction 0.6171875 original
main object 1.421875 original
object pairing 0.1630859375 original
richness 1.8359375 original
background 0.984375 distorted
clarity 0.9609375 distorted
color aesthetic 0.26953125 distorted
color brightness 0.78515625 distorted
detail realism 0.875 distorted
detail refinement 0.41015625 distorted
emotion 0.671875 distorted
lighting distinction 0.2353515625 distorted
main object 0.7109375 distorted
object pairing 0.70703125 distorted
richness 1.34375 distorted
background 0.068359375 original
clarity 1.984375 original
color aesthetic 1.7578125 original
color brightness 0.4453125 original
detail realism 0.439453125 original
detail refinement 1.6484375 original
emotion 1.9609375 original
lighting distinction 0.5390625 original
ma

  3%|▎         | 95/3300 [00:31<52:18,  1.02it/s]  

background 0.89453125 distorted
clarity 1.9375 distorted
color aesthetic 0.7578125 distorted
color brightness 0.6328125 distorted
detail realism 0.0233154296875 distorted
detail refinement 0.82421875 distorted
emotion 0.9921875 distorted
lighting distinction 0.90625 distorted
main object 2.0 distorted
object pairing 0.89453125 distorted
richness 1.0546875 distorted


  3%|▎         | 101/3300 [00:32<20:04,  2.66it/s]

background 0.1171875 original
clarity 1.9765625 original
color aesthetic 0.91796875 original
color brightness 0.4296875 original
detail realism 0.44921875 original
detail refinement 1.3046875 original
emotion 0.66796875 original
lighting distinction 0.98828125 original
main object 1.9921875 original
object pairing 0.11962890625 original
richness 1.4921875 original


  3%|▎         | 105/3300 [00:35<27:23,  1.94it/s]

background 0.59375 distorted
clarity 0.43359375 distorted
color aesthetic 0.2451171875 distorted
color brightness 0.921875 distorted
detail realism 0.10791015625 distorted
detail refinement 0.1572265625 distorted
emotion 0.69140625 distorted
lighting distinction 0.98828125 distorted
main object 0.9453125 distorted
object pairing 0.408203125 distorted
richness 1.234375 distorted


  4%|▎         | 121/3300 [00:36<04:32, 11.67it/s]

background 0.47265625 original
clarity 1.1328125 original
color aesthetic 0.9609375 original
color brightness 0.00994873046875 original
detail realism 0.921875 original
detail refinement 0.7578125 original
emotion 1.7265625 original
lighting distinction 0.0927734375 original
main object 1.96875 original
object pairing 0.1953125 original
richness 1.8515625 original


  4%|▎         | 123/3300 [00:38<18:59,  2.79it/s]

background 0.228515625 distorted
clarity 1.046875 distorted
color aesthetic 0.8671875 distorted
color brightness 0.2080078125 distorted
detail realism 0.890625 distorted
detail refinement 0.7890625 distorted
emotion 0.61328125 distorted
lighting distinction 0.59375 distorted
main object 0.734375 distorted
object pairing 0.0712890625 distorted
richness 1.109375 distorted
background 0.0927734375 original
clarity 1.9609375 original
color aesthetic 0.9765625 original
color brightness 0.0272216796875 original
detail realism 0.828125 original
detail refinement 1.4296875 original
emotion 1.8125 original
lighting distinction 0.341796875 original
main object 1.9921875 original
object pairing 0.70703125 original
richness 1.734375 original


  4%|▍         | 125/3300 [00:42<45:23,  1.17it/s]

background 0.48046875 distorted
clarity 0.8828125 distorted
color aesthetic 0.73046875 distorted
color brightness 0.31640625 distorted
detail realism 0.796875 distorted
detail refinement 0.80859375 distorted
emotion 0.80078125 distorted
lighting distinction 0.2734375 distorted
main object 1.765625 distorted
object pairing 0.34765625 distorted
richness 1.40625 distorted
background 0.033203125 original
clarity 1.9375 original
color aesthetic 1.921875 original
color brightness 0.0673828125 original
detail realism 0.5703125 original
detail refinement 1.3046875 original
emotion 1.75 original
lighting distinction 0.2255859375 original
main object 2.0 original
object pairing 0.34375 original
richness 1.8359375 original


  4%|▍         | 128/3300 [00:46<49:15,  1.07it/s]  

background 0.0849609375 distorted
clarity 0.1103515625 distorted
color aesthetic 0.46875 distorted
color brightness 0.3203125 distorted
detail realism 0.578125 distorted
detail refinement 0.1484375 distorted
emotion 0.6640625 distorted
lighting distinction 0.81640625 distorted
main object 0.1650390625 distorted
object pairing 0.73046875 distorted
richness 1.25 distorted


  4%|▍         | 142/3300 [00:47<06:29,  8.10it/s]

background 0.115234375 original
clarity 1.9453125 original
color aesthetic 1.859375 original
color brightness 0.130859375 original
detail realism 0.5703125 original
detail refinement 1.390625 original
emotion 1.890625 original
lighting distinction 0.5859375 original
main object 1.9765625 original
object pairing 0.1162109375 original
richness 1.6171875 original


  4%|▍         | 146/3300 [00:51<25:28,  2.06it/s]

background 0.07373046875 distorted
clarity 1.4921875 distorted
color aesthetic 0.6796875 distorted
color brightness 0.275390625 distorted
detail realism 0.44921875 distorted
detail refinement 0.96875 distorted
emotion 0.4765625 distorted
lighting distinction 0.1572265625 distorted
main object 1.9765625 distorted
object pairing 0.73046875 distorted
richness 1.171875 distorted


  6%|▌         | 188/3300 [00:53<03:06, 16.66it/s]

background 0.05322265625 original
clarity 1.71875 original
color aesthetic 0.92578125 original
color brightness 0.0028533935546875 original
detail realism 0.51171875 original
detail refinement 1.3671875 original
emotion 0.9609375 original
lighting distinction 0.21875 original
main object 1.9375 original
object pairing 0.1171875 original
richness 1.5 original


  6%|▌         | 192/3300 [00:57<24:11,  2.14it/s]

background 0.09716796875 distorted
clarity 0.66015625 distorted
color aesthetic 0.62109375 distorted
color brightness 0.004730224609375 distorted
detail realism 0.2265625 distorted
detail refinement 0.50390625 distorted
emotion 0.9375 distorted
lighting distinction 0.3515625 distorted
main object 1.9921875 distorted
object pairing 0.4375 distorted
richness 1.4921875 distorted


  6%|▌         | 194/3300 [00:57<17:46,  2.91it/s]

background 0.16796875 original
clarity 1.8984375 original
color aesthetic 1.71875 original
color brightness 0.00061798095703125 original
detail realism 0.55859375 original
detail refinement 1.40625 original
emotion 1.5546875 original
lighting distinction 0.181640625 original
main object 1.9921875 original
object pairing 0.75390625 original
richness 1.3125 original


  6%|▌         | 198/3300 [01:01<32:06,  1.61it/s]

background 0.294921875 distorted
clarity 0.46875 distorted
color aesthetic 0.3203125 distorted
color brightness 1.5078125 distorted
detail realism 0.1845703125 distorted
detail refinement 0.3359375 distorted
emotion 0.71875 distorted
lighting distinction 0.076171875 distorted
main object 1.765625 distorted
object pairing 0.248046875 distorted
richness 0.703125 distorted


  6%|▋         | 210/3300 [01:02<06:24,  8.03it/s]

background 0.0250244140625 original
clarity 1.609375 original
color aesthetic 0.82421875 original
color brightness 1.578125 original
detail realism 0.546875 original
detail refinement 1.03125 original
emotion 1.765625 original
lighting distinction 0.984375 original
main object 1.9921875 original
object pairing 0.23046875 original
richness 1.4921875 original


  6%|▋         | 212/3300 [01:06<36:38,  1.40it/s]

background 0.044677734375 distorted
clarity 0.890625 distorted
color aesthetic 0.89453125 distorted
color brightness 0.0020904541015625 distorted
detail realism 0.68359375 distorted
detail refinement 0.57421875 distorted
emotion 0.25390625 distorted
lighting distinction 0.298828125 distorted
main object 2.0 distorted
object pairing 0.4375 distorted
richness 1.2109375 distorted


  7%|▋         | 220/3300 [01:07<11:07,  4.61it/s]

background 0.2216796875 original
clarity 1.6171875 original
color aesthetic 1.3984375 original
color brightness 0.0027313232421875 original
detail realism 0.33203125 original
detail refinement 1.2421875 original
emotion 1.7421875 original
lighting distinction 0.47265625 original
main object 2.0 original
object pairing 0.07958984375 original
richness 1.59375 original


  7%|▋         | 224/3300 [01:11<28:30,  1.80it/s]

background 0.0159912109375 distorted
clarity 0.8984375 distorted
color aesthetic 0.53125 distorted
color brightness 0.3515625 distorted
detail realism 0.0966796875 distorted
detail refinement 0.296875 distorted
emotion 0.8359375 distorted
lighting distinction 0.9296875 distorted
main object 1.296875 distorted
object pairing 0.349609375 distorted
richness 1.28125 distorted


  7%|▋         | 234/3300 [01:12<07:22,  6.93it/s]

background 0.06005859375 original
clarity 1.625 original
color aesthetic 1.3984375 original
color brightness 0.76171875 original
detail realism 0.7578125 original
detail refinement 1.3671875 original
emotion 0.8984375 original
lighting distinction 0.828125 original
main object 1.9140625 original
object pairing 0.146484375 original
richness 1.8515625 original


  7%|▋         | 236/3300 [01:14<20:18,  2.52it/s]

background 0.984375 distorted
clarity 0.8203125 distorted
color aesthetic 0.68359375 distorted
color brightness 0.69140625 distorted
detail realism 0.76171875 distorted
detail refinement 0.447265625 distorted
emotion 0.8203125 distorted
lighting distinction 0.84375 distorted
main object 1.7578125 distorted
object pairing 0.294921875 distorted
richness 1.4296875 distorted
background 0.2890625 original
clarity 1.8828125 original
color aesthetic 1.515625 original
color brightness 0.333984375 original
detail realism 0.8828125 original
detail refinement 1.46875 original
emotion 1.75 original
lighting distinction 0.330078125 original
main object 2.0 original
object pairing 0.041748046875 original
richness 1.5859375 original


  7%|▋         | 240/3300 [01:17<28:52,  1.77it/s]

background 0.77734375 distorted
clarity 1.28125 distorted
color aesthetic 0.73046875 distorted
color brightness 0.1201171875 distorted
detail realism 0.478515625 distorted
detail refinement 0.326171875 distorted
emotion 1.0390625 distorted
lighting distinction 0.0751953125 distorted
main object 1.15625 distorted
object pairing 0.458984375 distorted
richness 1.0234375 distorted


  7%|▋         | 244/3300 [01:17<15:41,  3.25it/s]

background 0.25 original
clarity 1.90625 original
color aesthetic 1.71875 original
color brightness 0.7890625 original
detail realism 0.10888671875 original
detail refinement 1.234375 original
emotion 1.9921875 original
lighting distinction 1.15625 original
main object 2.0 original
object pairing 0.8046875 original
richness 1.40625 original
background 0.53125 distorted
clarity 0.859375 distorted
color aesthetic 0.81640625 distorted
color brightness 0.6796875 distorted
detail realism 0.029541015625 distorted
detail refinement 0.310546875 distorted
emotion 0.74609375 distorted
lighting distinction 0.50390625 distorted
main object 0.953125 distorted
object pairing 0.470703125 distorted
richness 0.59375 distorted
background 0.1435546875 original
clarity 1.5625 original
color aesthetic 0.8984375 original
color brightness 0.0908203125 original
detail realism 0.3359375 original
detail refinement 1.3046875 original
emotion 1.4296875 original
lighting distinction 0.189453125 original
main objec

  8%|▊         | 248/3300 [01:25<46:57,  1.08it/s]  

background 0.27734375 distorted
clarity 0.8203125 distorted
color aesthetic 0.1484375 distorted
color brightness 0.56640625 distorted
detail realism 0.87109375 distorted
detail refinement 0.5546875 distorted
emotion 0.78515625 distorted
lighting distinction 0.05322265625 distorted
main object 1.9453125 distorted
object pairing 0.18359375 distorted
richness 1.0078125 distorted


  8%|▊         | 280/3300 [01:27<02:58, 16.93it/s]

background 0.1044921875 original
clarity 1.8671875 original
color aesthetic 1.90625 original
color brightness 0.314453125 original
detail realism 0.408203125 original
detail refinement 1.40625 original
emotion 1.5546875 original
lighting distinction 0.236328125 original
main object 2.0 original
object pairing 0.72265625 original
richness 1.3125 original


  9%|▊         | 282/3300 [01:31<33:51,  1.49it/s]

background 0.0791015625 distorted
clarity 1.921875 distorted
color aesthetic 0.70703125 distorted
color brightness 0.008544921875 distorted
detail realism 0.59765625 distorted
detail refinement 1.28125 distorted
emotion 0.78515625 distorted
lighting distinction 1.3046875 distorted
main object 2.0 distorted
object pairing 0.53125 distorted
richness 1.125 distorted


  9%|▊         | 284/3300 [01:31<24:34,  2.05it/s]

background 0.47265625 original
clarity 1.984375 original
color aesthetic 1.9453125 original
color brightness 0.41015625 original
detail realism 0.70703125 original
detail refinement 1.6015625 original
emotion 1.2578125 original
lighting distinction 0.84765625 original
main object 2.0 original
object pairing 0.8671875 original
richness 1.34375 original


  9%|▊         | 288/3300 [01:33<23:19,  2.15it/s]

background 0.0224609375 distorted
clarity 1.8359375 distorted
color aesthetic 0.8984375 distorted
color brightness 0.0703125 distorted
detail realism 0.84375 distorted
detail refinement 1.25 distorted
emotion 0.8203125 distorted
lighting distinction 0.10400390625 distorted
main object 2.0 distorted
object pairing 0.271484375 distorted
richness 1.2890625 distorted


  9%|▉         | 292/3300 [01:33<12:58,  3.87it/s]

background 0.00860595703125 original
clarity 1.953125 original
color aesthetic 1.03125 original
color brightness 0.8515625 original
detail realism 0.546875 original
detail refinement 1.328125 original
emotion 1.9296875 original
lighting distinction 0.4609375 original
main object 2.0 original
object pairing 0.1962890625 original
richness 1.5 original


  9%|▉         | 296/3300 [01:35<17:38,  2.84it/s]

background 0.80078125 distorted
clarity 1.875 distorted
color aesthetic 0.7578125 distorted
color brightness 0.6796875 distorted
detail realism 0.88671875 distorted
detail refinement 1.03125 distorted
emotion 0.9453125 distorted
lighting distinction 0.515625 distorted
main object 2.0 distorted
object pairing 0.051513671875 distorted
richness 1.234375 distorted


100%|██████████| 3300/3300 [03:37<00:00, 15.17it/s]


In [None]:
# The rater scores for both images differ from last time. I got it -- it is so simple: both images were regenerated!

In [6]:
# Sanity check before the "rater" column is replaced: there must be exactly one
# new rater result per dataset row. The row count is taken from `dataset.num_rows`
# so the column itself does not have to be read just to be counted.
assert "rater" in dataset.column_names, "dataset has no 'rater' column to replace"
assert len(rater_results) == dataset.num_rows, (
    f"expected {dataset.num_rows} rater results, got {len(rater_results)}"
)

In [7]:
# Swap the existing "rater" column for the freshly computed `rater_results`
# and rebind `dataset` to the resulting dataset.
dataset = dataset.remove_columns("rater").add_column("rater", rater_results)  

In [13]:
# Spot-check: show the distorted prompt stored for row 321.
dataset[321]["prompt_distorted"]

'A black cow with horns stands on a field, but its horns melt into its head, legs warp into stubs, and the field dissolves into pixelated static; background is absent or glitched with muddy smears; the image feels randomly generated, lacking intent, with obvious distortion even when scaled down.'

In [15]:
# Upload the current dataset to the Hub repository "weathon/aas_benchmark_2" as a private dataset.
dataset.push_to_hub("weathon/aas_benchmark_2", private=True)

Uploading the dataset shards:   0%|          | 0/15 [00:00<?, ? shards/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

{"timestamp":"2025-10-20T08:25:28.288603Z","level":"WARN","fields":{"message":"Status Code: 500. Retrying...","request_id":"01K80BHEYCM1E1PWB6FJNP10PX"},"filename":"/home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-10-20T08:25:28.288646Z","level":"WARN","fields":{"message":"Retry attempt #0. Sleeping 2.55850007s before the next attempt"},"filename":"/root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}
{"timestamp":"2025-10-20T08:26:01.200158Z","level":"WARN","fields":{"message":"Status Code: 500. Retrying...","request_id":"01K80BJF23X6KKK9QGRTGXPCXC"},"filename":"/home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-10-20T08:26:01.200202Z","level":"WARN","fields":{"message":"Retry attempt #1. Sleeping 5.847714095s before the next attempt"},"filename":"/root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-r

KeyboardInterrupt: 

In [10]:
# Write the current dataset to the local directory "aas_benchmark_final.hf".
dataset.save_to_disk("aas_benchmark_final.hf")

Saving the dataset (0/15 shards):   0%|          | 0/3300 [00:00<?, ? examples/s]

In [426]:
# Write the current dataset to the local directory "aas_benchmark_2_with_blip".
dataset.save_to_disk("aas_benchmark_2_with_blip")

Saving the dataset (0/15 shards):   0%|          | 0/3300 [00:00<?, ? examples/s]

In [1]:
from datasets import load_from_disk
# Reload the locally saved "aas_benchmark_final.hf" copy and publish it to the Hub
# as a public dataset (private=False).
dataset = load_from_disk("aas_benchmark_final.hf")
dataset.push_to_hub("weathon/aas_benchmark_final", private=False)

Uploading the dataset shards:   0%|          | 0/15 [00:00<?, ? shards/s]

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Map:   0%|          | 0/220 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/5 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

CommitInfo(commit_url='https://huggingface.co/datasets/weathon/aas_benchmark_final/commit/b989f640e7621e779c8e62efef8c26f77816e99b', commit_message='Upload dataset', commit_description='', oid='b989f640e7621e779c8e62efef8c26f77816e99b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/weathon/aas_benchmark_final', endpoint='https://huggingface.co', repo_type='dataset', repo_id='weathon/aas_benchmark_final'), pr_revision=None, pr_num=None)

In [None]:
import time
# Debug loop: walk the whole dataset and, for every row index contained in
# idx_of_interest, print the index, pause for one second, and print it again.
for i, sample in enumerate(tqdm.tqdm(dataset)):
    if i in idx_of_interest:
        print(i)
        time.sleep(1)
        print(i)

In [424]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['image_original', 'image_distorted', 'index', 'prompt_original', 'prompt_distorted', 'selected_dims', 'llm_judge', 'hpsv2_reward', 'hpsv3_reward', 'image_reward', 'rater', 'llm_selected', 'blip_selected', 'model', 'blip_score'],
    num_rows: 3300
})

In [None]:
from hpsv3 import HPSv3RewardInferencer 

# Build the HPSv3 reward model used by hpsv3_reward below.
# NOTE(review): the notebook's first cell sets CUDA_VISIBLE_DEVICES="2" before
# importing torch. If that applies to this kernel, presumably only one GPU is
# visible (as cuda:0) and 'cuda:2' would not exist -- confirm the intended device.
inferencer = HPSv3RewardInferencer(device='cuda:2')

import torch
def hpsv3_reward(sample, i):
    """Score the four image/prompt pairings of one sample with HPSv3.

    Rows whose index ``i`` is not in ``idx_of_interest`` are not re-scored;
    their existing ``sample["hpsv3_reward"]`` value is returned unchanged.

    Args:
        sample: Dataset row providing ``image_original``, ``image_distorted``,
            ``prompt_original``, ``prompt_distorted`` and ``hpsv3_reward``.
        i: Index of the row within the dataset.

    Returns:
        The stored reward for skipped rows; otherwise a dict with the keys
        ``hpsv3_oiop``, ``hpsv3_oidp``, ``hpsv3_diop`` and ``hpsv3_didp``
        (o/d = original/distorted, i/p = image/prompt).
    """
    if i not in idx_of_interest:
        return sample["hpsv3_reward"]

    images = (sample["image_original"], sample["image_distorted"])
    prompts = (sample["prompt_original"], sample["prompt_distorted"])
    # Image-major cross product: (orig, orig), (orig, dist), (dist, orig), (dist, dist).
    pairs = [(image, prompt) for image in images for prompt in prompts]

    with torch.no_grad(), torch.cuda.amp.autocast():
        scores = inferencer.reward(
            prompts=[prompt for _, prompt in pairs],
            image_paths=[image for image, _ in pairs],
        )

    # Key order mirrors the pair order built above.
    keys = ("hpsv3_oiop", "hpsv3_oidp", "hpsv3_diop", "hpsv3_didp")
    return {key: scores[position] for position, key in enumerate(keys)}
  
# Collect one HPSv3 result per dataset row, in row order.
# hpsv3_reward requires the row index as its second argument (it uses the index
# to decide whether the row is re-scored), so the loop enumerates the dataset
# and passes the index along.
rewards = []
import tqdm
for i, sample in enumerate(tqdm.tqdm(dataset)):
    reward = hpsv3_reward(sample, i)
    rewards.append(reward)


ImportError: cannot import name 'DistributedTensorGatherer' from 'transformers.trainer' (/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/trainer.py)

In [2]:
from datasets import load_dataset, load_from_disk
# dataset = load_dataset("weathon/aas_benchmark", split="train")
# dataset = load_from_disk("aas_benchmark_2_with_blip_2")

In [None]:
# import json
# with open("../hpsv2_rewards.json", "r") as f:
#     hpsv2_rewards = json.load(f)

In [None]:
# dataset.remove_columns("hpsv2_reward").add_column("hpsv2_reward", hpsv2_rewards)

Dataset({
    features: ['image_original', 'image_distorted', 'index', 'prompt_original', 'prompt_distorted', 'selected_dims', 'llm_judge', 'hpsv3_reward', 'image_reward', 'rater', 'llm_selected', 'blip_selected', 'model', 'blip_score', 'hpsv2_reward'],
    num_rows: 3300
})

In [None]:
# dataset.save_to_disk("aas_benchmark_2_with_blip_2")

Saving the dataset (0/15 shards):   0%|          | 0/3300 [00:00<?, ? examples/s]

In [3]:
# Load the working copy of the dataset from the local directory "aas_benchmark_2_with_blip_2".
dataset = load_from_disk("aas_benchmark_2_with_blip_2")

In [6]:

import ImageReward as RM
# Load the ImageReward scorer; image_reward_reward below calls model.score(...).
# NOTE(review): this rebinds the global name `model`, which the notebook's first
# cells use for the BLIP/PEFT model -- in a shared kernel that model is no longer
# reachable under this name afterwards.
model = RM.load("ImageReward-v1.0")

import torch
def image_reward_reward(sample, i):
    """Score the four image/prompt pairings of one sample with ImageReward.

    Rows whose index ``i`` is not in ``idx_of_interest`` are not re-scored;
    their existing ``sample["image_reward"]`` value is returned unchanged.

    Args:
        sample: Dataset row providing ``image_original``, ``image_distorted``,
            ``prompt_original``, ``prompt_distorted`` and ``image_reward``.
        i: Index of the row within the dataset.

    Returns:
        The stored reward for skipped rows; otherwise a dict of floats with the
        keys ``image_reward_oiop``, ``image_reward_oidp``, ``image_reward_diop``
        and ``image_reward_didp`` (o/d = original/distorted, i/p = image/prompt).
        The freshly computed dict is also printed.
    """
    if i not in idx_of_interest:
        return sample["image_reward"]

    images = (sample["image_original"], sample["image_distorted"])
    prompts = (sample["prompt_original"], sample["prompt_distorted"])

    with torch.no_grad(), torch.cuda.amp.autocast():
        # Image-major order: (orig, orig), (orig, dist), (dist, orig), (dist, dist).
        scores = [model.score(prompt, image) for image in images for prompt in prompts]

    # Key order mirrors the scoring order above.
    keys = (
        "image_reward_oiop",
        "image_reward_oidp",
        "image_reward_diop",
        "image_reward_didp",
    )
    results = {key: float(scores[position]) for position, key in enumerate(keys)}
    print(results)
    return results


# Collect one ImageReward result per dataset row, in row order. The row index is
# passed to image_reward_reward, which uses it to decide whether the row is
# re-scored or its stored value is reused.
rewards = []
import tqdm
for i, sample in enumerate(tqdm.tqdm(dataset)):
    reward = image_reward_reward(sample, i) 
    rewards.append(reward)

load checkpoint from /home/wg25r/.cache/ImageReward/ImageReward.pt
checkpoint loaded


  0%|          | 12/3300 [00:01<06:31,  8.40it/s]

{'image_reward_oiop': 1.6318359375, 'image_reward_oidp': 1.4033203125, 'image_reward_diop': -1.9599609375, 'image_reward_didp': -0.068115234375}


  1%|          | 32/3300 [00:02<04:22, 12.45it/s]

{'image_reward_oiop': 1.6728515625, 'image_reward_oidp': 1.626953125, 'image_reward_diop': 0.37451171875, 'image_reward_didp': 0.35888671875}


  1%|▏         | 48/3300 [00:03<04:25, 12.24it/s]

{'image_reward_oiop': 1.8701171875, 'image_reward_oidp': 0.00301361083984375, 'image_reward_diop': 1.908203125, 'image_reward_didp': 1.80078125}


  2%|▏         | 58/3300 [00:04<04:32, 11.92it/s]

{'image_reward_oiop': 1.0, 'image_reward_oidp': 1.470703125, 'image_reward_diop': -0.296142578125, 'image_reward_didp': 1.41796875}


  2%|▏         | 80/3300 [00:06<04:23, 12.22it/s]

{'image_reward_oiop': 1.7587890625, 'image_reward_oidp': 1.572265625, 'image_reward_diop': 1.1142578125, 'image_reward_didp': 1.4326171875}


  3%|▎         | 88/3300 [00:06<04:31, 11.84it/s]

{'image_reward_oiop': 1.3564453125, 'image_reward_oidp': 0.5126953125, 'image_reward_diop': 0.451416015625, 'image_reward_didp': 1.4580078125}


  3%|▎         | 92/3300 [00:07<05:32,  9.66it/s]

{'image_reward_oiop': 0.6748046875, 'image_reward_oidp': 1.412109375, 'image_reward_diop': 0.230224609375, 'image_reward_didp': 0.98095703125}


  3%|▎         | 96/3300 [00:07<05:24,  9.88it/s]

{'image_reward_oiop': 1.9345703125, 'image_reward_oidp': 1.90234375, 'image_reward_diop': 1.77734375, 'image_reward_didp': 1.892578125}


  3%|▎         | 106/3300 [00:08<04:37, 11.49it/s]

{'image_reward_oiop': 1.4287109375, 'image_reward_oidp': 1.275390625, 'image_reward_diop': -0.2071533203125, 'image_reward_didp': 0.492919921875}


  4%|▍         | 124/3300 [00:09<04:19, 12.25it/s]

{'image_reward_oiop': 0.180908203125, 'image_reward_oidp': -0.5830078125, 'image_reward_diop': -0.488037109375, 'image_reward_didp': 1.2646484375}
{'image_reward_oiop': 1.3623046875, 'image_reward_oidp': 0.7373046875, 'image_reward_diop': -0.1895751953125, 'image_reward_didp': 1.1044921875}


  4%|▍         | 128/3300 [00:10<05:54,  8.96it/s]

{'image_reward_oiop': 1.3544921875, 'image_reward_oidp': 1.49609375, 'image_reward_diop': 1.0849609375, 'image_reward_didp': 1.458984375}


  4%|▍         | 146/3300 [00:11<03:59, 13.18it/s]

{'image_reward_oiop': 0.483154296875, 'image_reward_oidp': -0.2220458984375, 'image_reward_diop': -0.38818359375, 'image_reward_didp': 0.45947265625}


  6%|▌         | 192/3300 [00:14<04:13, 12.27it/s]

{'image_reward_oiop': 0.2802734375, 'image_reward_oidp': -0.152587890625, 'image_reward_diop': -0.6982421875, 'image_reward_didp': 1.2197265625}


  6%|▌         | 198/3300 [00:14<04:27, 11.62it/s]

{'image_reward_oiop': 1.837890625, 'image_reward_oidp': 0.2264404296875, 'image_reward_diop': -0.7275390625, 'image_reward_didp': 0.849609375}


  6%|▋         | 214/3300 [00:16<04:07, 12.45it/s]

{'image_reward_oiop': 1.798828125, 'image_reward_oidp': 1.5849609375, 'image_reward_diop': -0.7998046875, 'image_reward_didp': 0.78515625}


  7%|▋         | 224/3300 [00:16<04:15, 12.03it/s]

{'image_reward_oiop': 1.6865234375, 'image_reward_oidp': 1.59765625, 'image_reward_diop': 0.6123046875, 'image_reward_didp': 1.1005859375}


  7%|▋         | 236/3300 [00:17<04:40, 10.93it/s]

{'image_reward_oiop': 1.162109375, 'image_reward_oidp': 0.875, 'image_reward_diop': 0.144287109375, 'image_reward_didp': 1.34765625}


  7%|▋         | 240/3300 [00:18<04:49, 10.58it/s]

{'image_reward_oiop': 1.2841796875, 'image_reward_oidp': 0.1865234375, 'image_reward_diop': 0.970703125, 'image_reward_didp': 1.02734375}


  7%|▋         | 244/3300 [00:18<03:59, 12.73it/s]

{'image_reward_oiop': 1.26171875, 'image_reward_oidp': 0.260498046875, 'image_reward_diop': 0.1597900390625, 'image_reward_didp': -1.017578125}


  8%|▊         | 248/3300 [00:19<05:33,  9.16it/s]

{'image_reward_oiop': 0.62890625, 'image_reward_oidp': -1.2314453125, 'image_reward_diop': 0.529296875, 'image_reward_didp': 1.076171875}


  9%|▊         | 284/3300 [00:21<03:48, 13.22it/s]

{'image_reward_oiop': 1.2705078125, 'image_reward_oidp': 0.59326171875, 'image_reward_diop': 1.00390625, 'image_reward_didp': 0.9697265625}


  9%|▊         | 288/3300 [00:21<04:20, 11.55it/s]

{'image_reward_oiop': 1.619140625, 'image_reward_oidp': 0.8466796875, 'image_reward_diop': 0.352294921875, 'image_reward_didp': 0.86962890625}


  9%|▉         | 296/3300 [00:22<04:04, 12.29it/s]

{'image_reward_oiop': 1.01953125, 'image_reward_oidp': 0.363525390625, 'image_reward_diop': 1.1728515625, 'image_reward_didp': 1.00390625}


100%|██████████| 3300/3300 [02:25<00:00, 22.72it/s]


In [7]:
# Sanity check before the "image_reward" column is replaced: there must be exactly
# one reward entry per dataset row. The row count is taken from `dataset.num_rows`
# so the column itself does not have to be read just to be counted.
assert "image_reward" in dataset.column_names, "dataset has no 'image_reward' column to replace"
assert len(rewards) == dataset.num_rows, (
    f"expected {dataset.num_rows} image rewards, got {len(rewards)}"
)

In [9]:
# Swap the existing "image_reward" column for the freshly collected `rewards`
# and rebind `dataset` to the resulting dataset.
dataset = dataset.remove_columns("image_reward").add_column("image_reward", rewards)

In [12]:
# dataset.push_to_hub("weathon/aas_benchmark_2", private=True)

In [13]:
# Write the dataset (with the replaced "image_reward" column) to the local
# directory "aas_benchmark_2_with_blip".
dataset.save_to_disk("aas_benchmark_2_with_blip")

Saving the dataset (0/15 shards):   0%|          | 0/3300 [00:00<?, ? examples/s]

In [15]:
import pickle
# Read the HPSv3 rewards from a pickle file in the parent directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file if it was produced by a trusted run.
with open("../hpsv3_rewards.pkl", "rb") as f:
    hpsv3_rewards = pickle.load(f)

In [16]:
# Sanity check before the "hpsv3_reward" column is replaced: there must be exactly
# one loaded reward entry per dataset row. The row count is taken from
# `dataset.num_rows` so the column itself does not have to be read just to be counted.
assert "hpsv3_reward" in dataset.column_names, "dataset has no 'hpsv3_reward' column to replace"
assert len(hpsv3_rewards) == dataset.num_rows, (
    f"expected {dataset.num_rows} HPSv3 rewards, got {len(hpsv3_rewards)}"
)

In [19]:
# Convert every HPSv3 reward record into plain Python values so it can be stored
# as a dataset column: tensor values go through .cpu().detach().tolist(), all
# other values are kept as they are. Record, key and value each get their own
# name so the comprehension does not shadow the variable it iterates over.
hpsv3_rewards_2 = [
    {
        key: value.cpu().detach().tolist() if isinstance(value, torch.Tensor) else value
        for key, value in record.items()
    }
    for record in hpsv3_rewards
]

In [20]:
# Swap the existing "hpsv3_reward" column for the converted `hpsv3_rewards_2`
# and rebind `dataset` to the resulting dataset.
dataset = dataset.remove_columns("hpsv3_reward").add_column("hpsv3_reward", hpsv3_rewards_2)

In [21]:
# Write the dataset (with the replaced "hpsv3_reward" column) to the local
# directory "aas_benchmark_2_with_blip".
dataset.save_to_disk("aas_benchmark_2_with_blip")

Saving the dataset (0/15 shards):   0%|          | 0/3300 [00:00<?, ? examples/s]