In [1]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor
import torch

model = Qwen3VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen3-VL-4B-Instruct", dtype=torch.bfloat16, device_map="cuda"
)

processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-4B-Instruct")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# %%
import os
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForImageTextRetrieval


from datasets import load_dataset

# Login using e.g. `huggingface-cli login` to access this dataset
train_ds = load_dataset("zai-org/VisionRewardDB-Image", split='train[:40000]')
test_ds = load_dataset("zai-org/VisionRewardDB-Image", split='train[40000:]')

import io, math, random
import numpy as np
from PIL import Image, ImageFilter, ImageEnhance
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd

df = pd.read_csv("rules.csv")

import pandas as pd
import re

df.columns = df.columns.str.strip()
df['Dimension'] = df['Dimension'].ffill()

df['dim_key'] = df['Dimension'].apply(lambda x: re.search(r'\((.*?)\)', x).group(1) if re.search(r'\((.*?)\)', x) else x)

guide = {
    dim_key: {
        int(row['Score']): row['Option'] + ": " +str(row['Description']).strip()
        for _, row in group.iterrows()
    }
    for dim_key, group in df.groupby('dim_key')
}

dims = {k: v for k, v in guide.items() if k not in ["unsafe type", "hands", "face", "body", "safety", "lighting aesthetic", "symmetry"]}.keys()
dims = list(dims)
dim_min = {i:min(guide[i].keys()) for i in guide.keys()}

# %%

Loading dataset shards:   0%|          | 0/19 [00:00<?, ?it/s]

In [3]:
processor.tokenizer.padding_side = "left"

In [29]:
import json
with open("prompts.json", "r") as f:
    prompt_dict = json.load(f)
    
def format_data(sample):
    images = []
    dims_selected = []
    # print(len(sample["image"]), len(sample["annotation"]))
    for image in range(len(sample['image'])):
        images.append(sample['image'][image])
        try:
            if random.random()>0.5:
                # sample a dim with score>=0 
                dims_selected.append(random.choice(list([i for i in dims if sample['annotation'][image][i]>=0])))
            else:
                # sample a dim with score<0
                dims_selected.append(random.choice(list([i for i in dims if sample['annotation'][image][i]<0])))
        except IndexError:
            dims_selected.append(random.choice(dims))
            

    prompts = [prompt_dict[dim] for i, dim in enumerate(dims_selected)]
    images = list(sample['image'])
    n_images = len(images) 
    n_prompts = len(prompts) 
    messages = [[
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": images[i].convert("RGB").resize((512, 512)),
                },
                {"type": "text", "text": prompt},
            ],
        }
    ] for i, prompt in enumerate(prompts)]

    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True, 
        return_dict=True,
        return_tensors="pt",
        padding=True,
        tokenize=True,
    ) 
    inputs["pixel_values"] = inputs["pixel_values"].unsqueeze(0)
     #torch.stack(inputs["pixel_values"].chunk(n_images, dim=0))
    answers = [1 if i[dim]<0 else (0 if i[dim]>0 else 0.5)for i, dim in zip(sample["annotation"], dims_selected)]
    labels = torch.tensor(answers)
    inputs['labels'] = labels
    inputs['dim'] = [dims.index(dim) for dim in dims_selected]
    return inputs

In [30]:
23552/23

1024.0

In [31]:
train_ds = train_ds.with_transform(format_data)
test_ds = test_ds.with_transform(format_data)
train_ds[0:23].pixel_values.shape

torch.Size([1, 23552, 1536])

In [32]:
# import torch
# with torch.no_grad(): 
#     print(model(**train_ds[0:2].to("cuda")).hidden_states.shape) 

In [33]:
# train_ds[0:2]["input_ids"][1]

In [34]:
from transformers import PreTrainedModel, PretrainedConfig

class Rater(PreTrainedModel):
    def __init__(self, backbone):
      super().__init__(PretrainedConfig())
      self.backbone = backbone
      self.head = torch.nn.Sequential(
         torch.nn.Linear(2560, 2560 * 4),
          torch.nn.ReLU(),
          torch.nn.Linear(2560 * 4, 1),
      )

    def forward(self, pixel_values, input_ids, attention_mask, image_grid_thw, dim, labels=None):
      hidden_states = self.backbone(
          pixel_values=pixel_values,
          input_ids=input_ids,
          attention_mask=attention_mask,
          image_grid_thw=image_grid_thw,
      ).hidden_states
      
      pooled_output = hidden_states[:, -1, :]
      logits = self.head(pooled_output)
      output = {'logits': logits.squeeze(-1)}
      if labels is not None: 
          bce_loss = torch.nn.functional.binary_cross_entropy_with_logits(
              logits.squeeze(-1), labels.float()
          )
          output['loss'] = bce_loss
      return output
my_rater = Rater(model).to("cuda", dtype=torch.bfloat16)

In [35]:
# lora
from peft import get_peft_model, LoraConfig, TaskType
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    modules_to_save=["head"]
)

In [36]:
my_rater = get_peft_model(my_rater, lora_config) 



In [37]:
my_rater = my_rater.to("cuda", dtype=torch.bfloat16)

In [38]:
from torch.utils.data import DataLoader
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)

In [None]:
test_ds[:2]["pixel_values"].shape zhijieshuchunebuyaoyongtoushouruak

torch.Size([1, 2048, 1536])

In [39]:
with torch.no_grad(): 
    with torch.autocast("cuda", dtype=torch.bfloat16):
        batch = next(iter(test_loader))
        batch = {k: v.to("cuda") if isinstance(v, torch.Tensor) else v for k, v in batch.items()}
        print(my_rater(**batch))

IndexError: index 1 is out of bounds for dimension 0 with size 1

In [None]:
# my_rater = my_rater.cpu()  

In [None]:
from transformers import TrainingArguments
import os
training_args = TrainingArguments(
    output_dir="Qwen-3-VL-Rater",
    learning_rate=3e-6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=20,
    weight_decay=0.001,
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=1,
    load_best_model_at_end=True,
    push_to_hub=True,
    max_grad_norm=1.0, 
    remove_unused_columns=False,
    fp16=True,
    warmup_ratio=0.01,
    lr_scheduler_type="cosine",
    report_to="wandb" 
)

from transformers import Trainer

trainer = Trainer(
    model=my_rater,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=processor,
)

# %%
trainer.train() 

/home/wg25r/miniconda/envs/neg/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/wg25r/miniconda/envs/neg/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 151645, 'pad_token_id': 151643}.
[34m[1mwandb[0m: Currently logged in as: [33mwguo6358[0m ([33m3dsmile[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


torch.Size([8, 1024, 1536])
torch.Size([8, 1024, 1536])
torch.Size([8, 1024, 1536])
tensor([[ 1, 32, 32],
        [ 1, 32, 32],
        [ 1, 32, 32],
        [ 1, 32, 32],
        [ 1, 32, 32],
        [ 1, 32, 32],
        [ 1, 32, 32],
        [ 1, 32, 32]], device='cuda:0')
torch.Size([8192, 1024]) torch.Size([8192, 1024])
tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], device='cuda:2')
tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]], device='cuda:1')


RuntimeError: Caught RuntimeError in replica 1 on device 1.
Original Traceback (most recent call last):
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 99, in _worker
    output = module(*input, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/peft/peft_model.py", line 881, in forward
    return self.get_base_model()(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/tmp/ipykernel_3912072/3983032265.py", line 14, in forward
    hidden_states = self.backbone(
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/utils/generic.py", line 927, in wrapper
    outputs = func(self, *args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 1376, in forward
    outputs = self.model(
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/utils/generic.py", line 927, in wrapper
    outputs = func(self, *args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 1170, in forward
    image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 1093, in get_image_features
    image_embeds, deepstack_image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 746, in forward
    pos_embeds = self.fast_pos_embed_interpolate(grid_thw)
  File "/home/wg25r/miniconda/envs/neg/lib/python3.10/site-packages/transformers/models/qwen3_vl/modeling_qwen3_vl.py", line 724, in fast_pos_embed_interpolate
    pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1)
RuntimeError: cannot reshape tensor of 0 elements into shape [0, 0, 2, 0, 2, -1] because the unspecified dimension size -1 can be any value and is ambiguous
