In [1]:
from datasets import load_dataset
dataset_paths = ["weathon/aas_benchmark-nano-banana","weathon/aas_benchmark-stable_diffusion_xl","weathon/aas_benchmark-flux_krea","weathon/aas_benchmark-flux_dev","weathon/aas_benchmark-grpo_flux","weathon/aas_benchmark-playground","weathon/aas_benchmark-sd3_medium_grpo","weathon/aas_benchmark-sd3_medium_grpo_geneval","weathon/aas_benchmark-stable_diffusion_3.5_medium","weathon/aas_benchmark-sd3_medium_grpo","weathon/aas_benchmark-dance_flux"]

In [2]:
datasets = [load_dataset(path, split="train") for path in dataset_paths]

In [3]:
from datasets import concatenate_datasets
dataset = concatenate_datasets(datasets)

In [4]:
from PIL import Image
from pydantic import BaseModel
from io import BytesIO
import base64
from typing import Literal
from openai import OpenAI 


class JudgeResponse(BaseModel):
    reasoning: str
    main_concepts: int
    special_effects: int


def encode_image(image: Image.Image) -> str:
    buffer = BytesIO()
    image.save(buffer, format="PNG")
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def ensure_percentage(value: int) -> None:
    if not (0 <= value <= 100):
        raise ValueError(f"LLM returned value outside 0-100 range: {value}")


In [22]:
import os
import dotenv
dotenv.load_dotenv()
deepinfra_api_key = os.getenv("DEEPNFRA_API_KEY")
openai = OpenAI(
    api_key=deepinfra_api_key,
    base_url="https://api.deepinfra.com/v1/openai",
)



In [23]:
def judge(image: Image.Image, original_prompt: str, distorted_prompt: str) -> JudgeResponse:
    encoded = encode_image(image)
    messages = [
        {"role": "system", "content": "You are a strict image judge. Reply with a JSON object that matches the provided schema."},
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": (
                        f"Original prompt:\n{original_prompt}\n\n"
                        f"Distorted prompt:\n{distorted_prompt}\n Answer the following using integers between 0 and 100 inclusive. IMPORTANT: If visual effects, styles, or distortions make the main concept harder to see but it is still present, DO NOT decrease the main concept score. 1. Main concept (0-100): score how clearly the main subjects or scenes from the ORIGINAL prompt appear, regardless of added effects that may partially obscure them. 2. Special effects (0-100): score how well the stylistic details, modifiers, and effects described in the distorted prompt appear. The rating should not be binary (just 0 or 100) but smooth, only rate 100 if it follows all effetcs and only rate 0 if none of the effects are visible. To do list for the effects, list the main effects mentioned in the distorted prompt and check if each of them is present, then rate based on the fraction of effects that are present. You should do the thinking in the reasoning part, list all effects mentioned in the distorted prompt, and see if the effects are in the image." 
                    ),
                },
                {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{encoded}"}},
            ],
        },
    ]
    for attempt in range(5):
        try:
            response = openai.chat.completions.parse(
                model="Qwen/Qwen3-VL-30B-A3B-Instruct",
                messages=messages,
                response_format=JudgeResponse,
                temperature=0.1,
            )
            parsed = response.choices[0].message.parsed
            ensure_percentage(parsed.main_concepts)
            ensure_percentage(parsed.special_effects)
            return parsed
        except Exception as exc:
            if attempt == 4:
                raise
            print(f"retrying llm judge due to error: {exc}")
    


In [24]:
dataset

Dataset({
    features: ['image_original', 'image_distorted', 'index', 'prompt_original', 'prompt_distorted', 'selected_dims', 'hpsv2', 'llm_judge', 'model'],
    num_rows: 3300
})

In [25]:
def process_example(example):
    original_judge = judge(example["image_original"], example["prompt_original"], example["prompt_distorted"])
    distorted_judge = judge(example["image_distorted"], example["prompt_original"], example["prompt_distorted"])

    example["llm_judge"] = {
            "llm_original_reasoning": original_judge.reasoning,
            "llm_original_main_concepts": original_judge.main_concepts,
            "llm_original_special_effects": original_judge.special_effects,
            "llm_distorted_reasoning": distorted_judge.reasoning,
            "llm_distorted_main_concepts": distorted_judge.main_concepts,
            "llm_distorted_special_effects": distorted_judge.special_effects,
        }
    return example

In [26]:
dataset = dataset.map(process_example, batched=False, num_proc=100)

Map (num_proc=100):   0%|          | 0/3300 [00:00<?, ? examples/s]

retrying llm judge due to error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=16384, prompt_tokens=1374, total_tokens=17758, completion_tokens_details=None, prompt_tokens_details=None, estimated_cost=0.01661862)
retrying llm judge due to error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=16384, prompt_tokens=1338, total_tokens=17722, completion_tokens_details=None, prompt_tokens_details=None, estimated_cost=0.01660818)
retrying llm judge due to error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=16384, prompt_tokens=1347, total_tokens=17731, completion_tokens_details=None, prompt_tokens_details=None, estimated_cost=0.01661079)
retrying llm judge due to error: Could not parse response content as the length limit was reached - CompletionUsage(completion_tokens=16384, prompt_tokens=1348, total_tokens=17732, completion_toke

In [None]:
dataset

In [None]:
dataset.push_to_hub("weathon/aas_benchmark", private=True)

In [1]:
from datasets import load_dataset
dataset = load_dataset("weathon/aas_benchmark", split="train")

In [2]:
from hpsv3 import HPSv3RewardInferencer

inferencer = HPSv3RewardInferencer(device='cuda:2')

Flash Attention is not installed. Falling to SDPA.


`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of Qwen2VLRewardModelBT were not initialized from the model checkpoint at Qwen/Qwen2-VL-7B-Instruct and are newly initialized: ['rm_head.0.bias', 'rm_head.0.weight', 'rm_head.3.bias', 'rm_head.3.weight', 'rm_head.5.bias', 'rm_head.5.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
import torch
def hpsv3_reward(sample):
    images = []
    prompts = [] 
    images_part = [sample["image_original"], sample["image_original"], sample["image_distorted"],  sample["image_distorted"]]
    prompts_part = [
        sample["prompt_original"],
        sample["prompt_distorted"],
        sample["prompt_original"],
        sample["prompt_distorted"]
    ] 
    images.extend(images_part)
    prompts.extend(prompts_part)

    with torch.no_grad():
        rewards = inferencer.reward(prompts=prompts, image_paths=images)
    results = []
    for i in range(0, len(rewards), 4):
        results.append({
            "hpsv3_oiop": rewards[i], # original image, original prompt
            "hpsv3_oidp": rewards[i+1], # original image, distorted prompt
            "hpsv3_diop": rewards[i+2], # distorted image, original prompt
            "hpsv3_didp": rewards[i+3], # distorted image, distorted prompt 
        })
    sample["hpsv3_reward"] = results
    return sample
  


In [None]:
hpsv3_reward(dataset[0])   

{'image_original': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1024x1024>,
 'image_distorted': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=1024x1024>,
 'index': 0,
 'prompt_original': 'A standing toilet in a bathroom stall next to a trash can.',
 'prompt_distorted': 'A standing toilet in a bathroom stall beside a trash can, rendered with no light or shadow, appearing random and unfinished, lacking clear intent or design, with flat, indistinct surfaces and no depth or texture.',
 'selected_dims': '["richness", "lighting distinction"]',
 'hpsv2': {'distorted': 0.0,
  'distorted_original_prompt': 0.0,
  'original': 0.0,
  'original_distorted_prompt': 0.0},
 'llm_judge': {'llm_distorted_main_concepts': 100,
  'llm_distorted_reasoning': 'The original prompt describes a standing toilet in a bathroom stall next to a trash can. The distorted prompt adds stylistic effects: no light or shadow, random and unfinished appearance, lack of clear intent or design, flat and indistinct

In [None]:
dataset = dataset.map(hpsv3_reward)

Map:   0%|          | 0/3300 [00:00<?, ? examples/s]