In [None]:
pip install datasets transformers torch pandas



In [None]:
# Install Qwen-VL utilities and update transformers/accelerate to support the latest models
!pip install qwen-vl-utils
!pip install -U transformers accelerate

Collecting qwen-vl-utils
  Downloading qwen_vl_utils-0.0.14-py3-none-any.whl.metadata (9.0 kB)
Collecting av (from qwen-vl-utils)
  Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Downloading qwen_vl_utils-0.0.14-py3-none-any.whl (8.1 kB)
Downloading av-16.0.1-cp312-cp312-manylinux_2_28_x86_64.whl (40.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m40.5/40.5 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av, qwen-vl-utils
Successfully installed av-16.0.1 qwen-vl-utils-0.0.14


In [None]:
# ==========================================
# 0. CLEANUP & IMPORTS
# ==========================================
import torch
import gc

# When this cell is re-run, the previous `model` / `processor` are still bound
# in the notebook namespace, so gc.collect() on its own cannot reclaim them and
# the old weights would stay allocated while the new copy is being loaded.
# Drop those references first; both names are re-created further down.
for _stale_name in ("model", "processor"):
    globals().pop(_stale_name, None)
del _stale_name

gc.collect()              # collect the now-unreferenced Python objects
torch.cuda.empty_cache()  # release PyTorch's cached, unused GPU memory

import sys
import os
import time
import pandas as pd
from torch.utils.data import DataLoader
# Using AutoModelForImageTextToText is safer for newer Transformers versions
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info
from tqdm import tqdm
from PIL import Image

# ==========================================
# 1. SETUP & READER
# ==========================================
from google.colab import drive

# Mount Google Drive only when /content/drive does not exist yet.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Make the project folder importable. The membership check keeps sys.path free
# of duplicate entries when this cell is executed more than once.
if '/content/drive/My Drive/AdMIRe_Project/' not in sys.path:
    sys.path.append('/content/drive/My Drive/AdMIRe_Project/')

import admire_dataset
import importlib
# Re-execute the module so the version currently on disk is the one in use,
# even if an older copy was imported earlier in this kernel session.
importlib.reload(admire_dataset)
from admire_dataset import AdMIReReader

# Root directory passed to AdMIReReader as `data_root_path` when the dataset is built.
EXTRACT_PATH = "/content/admire_data"

# ==========================================
# 2. LOAD QWEN-VL (LOW RES)
# ==========================================
MODEL_ID = "Qwen/Qwen2-VL-2B-Instruct"

print(f"‚è≥ Loading {MODEL_ID} (Low Res Mode)...")

try:
    # Weights are requested in bfloat16; device placement is left to device_map="auto".
    model = AutoModelForImageTextToText.from_pretrained(
        MODEL_ID, dtype=torch.bfloat16, device_map="auto", trust_remote_code=True,
    )

    # min_pixels == max_pixels pins every image to a single pixel budget of
    # 256 * 28 * 28 pixels.
    # NOTE(review): per the Qwen2-VL docs this is a budget of 256 visual tokens
    # (28x28 pixels each), not a 256-pixel edge length -- confirm.
    processor = AutoProcessor.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        min_pixels=256 * 28 * 28,
        max_pixels=256 * 28 * 28,
    )
except Exception as e:
    # Report the failure, then re-raise so execution stops at this point.
    print(f"‚ùå Error loading model: {e}")
    raise
# ==========================================
# 3. COLLATE FUNCTION
# ==========================================
def qwen_collate_fn(batch):
    """Collate dataset items into processor inputs plus their labels.

    Every item is read through its 'image_paths', 'text' and 'label' keys.
    For each item a single-turn user message is assembled (all images first,
    then the instruction text), rendered with the processor's chat template,
    and its images are extracted with ``process_vision_info``.

    Args:
        batch: sequence of items as yielded by the dataset.

    Returns:
        A ``(inputs, labels)`` pair: the padded processor output (PyTorch
        tensors) for the whole batch, and the list of each item's 'label'
        value in batch order.
    """
    chat_texts = []
    vision_inputs = []
    targets = []

    for item in batch:
        # One image entry per path; a path equal to "MISSING" is swapped for a
        # freshly created 224x224 RGB placeholder image.
        content = [
            {
                "type": "image",
                "image": Image.new('RGB', (224, 224)) if path == "MISSING" else path,
            }
            for path in item['image_paths']
        ]

        prompt = (
            f"Context: {item['text']}\n"
            "Task: The images above are labeled Image 1, 2, 3, 4, 5. "
            "Which single image best represents the idiom? "
            "Respond ONLY with the correct image number (1, 2, 3, 4, or 5)."
        )
        # The instruction text goes after the images inside the same user turn.
        content.append({"type": "text", "text": prompt})
        conversation = [{"role": "user", "content": content}]

        chat_texts.append(
            processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
        )
        # Second return value is discarded; only the image inputs are used here.
        item_images, _ = process_vision_info(conversation)
        vision_inputs.extend(item_images)
        targets.append(item['label'])

    inputs = processor(text=chat_texts, images=vision_inputs, padding=True, return_tensors="pt")
    return inputs, targets

# ==========================================
# 4. RUN TEST (5 Samples)
# ==========================================
# Load the Train split and keep only the first 5 rows for a quick smoke test.
import re  # imported once here instead of on every loop iteration

try:
    dataset = AdMIReReader(data_root_path=EXTRACT_PATH, split="Train", mode="qwen")
    dataset.df = dataset.df.head(5)
    print(f"‚úÖ TEST MODE: Dataset limited to {len(dataset)} items.")
except Exception as e:
    # Report the failure, then re-raise so execution stops at this point.
    print(f"‚ùå Data Error: {e}")
    raise

BATCH_SIZE = 1
loader = DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=qwen_collate_fn, num_workers=0)

# One record per evaluated sample; turned into a DataFrame for the report.
results = []
print(f"\nüöÄ Starting Qwen2 Low-Res Test...")

model.eval()
with torch.no_grad():
    for inputs, labels in tqdm(loader, desc="Testing"):

        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by wall-clock adjustments.
        start = time.perf_counter()
        inputs = inputs.to(model.device)

        generated_ids = model.generate(**inputs, max_new_tokens=5)
        duration = time.perf_counter() - start

        # Decode: strip the prompt tokens from the front of every generated
        # sequence so that only the newly produced tokens are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        decoded_texts = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True)

        # Score every sample of the batch (not just the first one), so the
        # report stays complete if BATCH_SIZE is raised above 1.
        for raw_text, truth_index in zip(decoded_texts, labels):
            output_text = raw_text.strip()

            # Parse: the first run of digits is the 1-based answer; -1 marks
            # a reply that contains no number at all.
            first_number = re.search(r'\d+', output_text)
            if first_number:
                pred_num = int(first_number.group())
                pred_index = pred_num - 1
            else:
                pred_num = -1
                pred_index = -1

            # Labels are compared as 0-based indices and reported 1-based.
            truth_num = truth_index + 1

            is_correct = (pred_index == truth_index)

            results.append({
                "True_Answer": truth_num,
                "Model_Prediction": output_text,
                "Result": "TRUE" if is_correct else "FALSE",
                # Time of the whole generate() call for the batch this sample was in.
                "Time_Sec": round(duration, 2)
            })

# ==========================================
# 5. REPORT
# ==========================================
# Tabulate the per-sample records and print them framed by 40-character rules,
# with the report title between the first two rules.
df = pd.DataFrame(results)
print("\n" + "=" * 40, "üìä QWEN2 LOW-RES REPORT", "=" * 40, sep="\n")
print(df, "=" * 40, sep="\n")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
‚è≥ Loading Qwen/Qwen3-VL-2B-Instruct (Low Res Mode)...


Loading weights:   0%|          | 0/625 [00:00<?, ?it/s]

üïµÔ∏è Scanning /content/admire_data for Train data...
‚úÖ Loaded: /content/admire_data/train/subtask_a_train.tsv
‚úÖ TEST MODE: Dataset limited to 5 items.

üöÄ Starting Qwen2 Low-Res Test...


Testing:   0%|          | 0/5 [00:00<?, ?it/s]