In [None]:
# ==========================================
# CELL 1: ENVIRONMENT SETUP
# ==========================================
import sys
import os
import shutil
import glob
import zipfile
import torch
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm # Use notebook version for better UI
from google.colab import drive

# Install dependencies (only if missing)
# `qwen_vl_utils` doubles as the "already installed" probe: when it imports
# cleanly the whole pip step below is skipped.
try:
    import qwen_vl_utils
except ImportError:
    print("📦 Installing Libraries...")
    # NOTE(review): transformers is installed from the GitHub repository with no
    # version pin. The recorded output of this cell shows it resolving to a
    # 5.0.0.dev0 build that pip reports as incompatible with
    # sentence-transformers; consider pinning a release or commit.
    !pip install -q git+https://github.com/huggingface/transformers accelerate bitsandbytes qwen-vl-utils pandas pillow

# Mount Drive
# Skipped when the mount point already exists, so re-running the cell does not
# call drive.mount() a second time.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# Drive folder the later cells read the dataset script/zips from and write the
# results TSV to.
project_path = '/content/drive/MyDrive/AdMIRe_Project'
print("✅ Environment Ready.")

📦 Installing Libraries...
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.0/521.0 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, but you have transformers 5.0.0.dev0 which is incompatible.[0m[31m
[0mMounted at /content/drive
✅ Environment Ready.


In [None]:
# ==========================================
# CELL 2: DATA PREPARATION (COPY & IMPORT)
# ==========================================
import os
import glob
import zipfile
import shutil
import sys
from google.colab import drive

# 1. MOUNT DRIVE
# Nothing to do when the mount point is already present.
if not os.path.exists('/content/drive'):
    drive.mount('/content/drive')

# 2. COPY THE DATASET READER
# The reader module lives in the Drive project folder; it is copied into the
# current working directory so the import in step 3 can find it.
project_path = '/content/drive/MyDrive/AdMIRe_Project'
script_source = os.path.join(project_path, "admire_dataset.py")

# STRICT MODE: fail fast when the reader script is missing.
if not os.path.exists(script_source):
    raise FileNotFoundError(f"❌ CRITICAL: 'admire_dataset.py' was not found in {project_path}. Please upload it to your Drive folder.")

shutil.copy(script_source, ".")
print("✅ Copied admire_dataset.py to local runtime.")

# 3. IMPORT READER
from admire_dataset import AdMIReReader

# 4. PREPARE DATASETS (TRAIN & TEST)
# Archive names as stored in the Drive project folder; adjust if yours differ.
train_zip = "AdMIRe Subtask A Train.zip"
test_zip = "AdMIRe Subtask A Test (Labelled).zip"

def unzip_if_needed(zip_name, target_dir):
    """Extract ``zip_name`` from the Drive project folder into ``target_dir`` once.

    Args:
        zip_name: File name of the archive inside ``project_path``.
        target_dir: Directory that should contain the extracted files.

    Returns:
        None. Progress and skip reasons are printed.

    Raises:
        zipfile.BadZipFile / OSError: If the archive cannot be read or the
            files cannot be written. No partially filled ``target_dir`` is
            left behind in that case.

    The archive is skipped when it is missing from Drive or when ``target_dir``
    already contains entries. Extraction goes into a sibling ``*.partial``
    staging folder that is renamed to ``target_dir`` only after it succeeds.
    An interrupted run therefore cannot leave a half-filled ``target_dir``
    that the "already extracted" check would later accept as complete.
    """
    zip_path = os.path.join(project_path, zip_name)

    if not os.path.exists(zip_path):
        print(f"⚠️ Warning: {zip_name} not found in Drive. Skipping unzip.")
        return

    # Heuristic: a non-empty target folder means an earlier run finished.
    if os.path.isdir(target_dir) and os.listdir(target_dir):
        print(f"✅ {zip_name} seems already extracted.")
        return

    print(f"📦 Extracting {zip_name}...")
    # normpath drops a trailing separator so the suffix lands on the folder name.
    staging_dir = os.path.normpath(target_dir) + ".partial"
    if os.path.exists(staging_dir):
        # Leftover from an interrupted earlier run: start from scratch.
        shutil.rmtree(staging_dir)
    os.makedirs(staging_dir)

    try:
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(staging_dir)
        if os.path.isdir(target_dir):
            # Known to be empty at this point (checked above); remove it so the
            # rename below works on every platform.
            os.rmdir(target_dir)
        os.rename(staging_dir, target_dir)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a partial extraction around.
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise

unzip_if_needed(train_zip, "/content/admire_data/train_data")
unzip_if_needed(test_zip, "/content/admire_data/test_data")

# 5. INITIALIZE READERS
print("\n--- INITIALIZING READERS ---")

# Reset both names first: if loading fails on a re-run, a reader left over from
# an earlier execution must not be picked up silently by the evaluation cell.
train_reader = None
test_reader = None

# Source Reader (Train) - Used for Few-Shot Examples
try:
    train_reader = AdMIReReader(data_root_path='/content/admire_data/train_data', split="Train", mode="qwen")
    print(f"✅ Source (Examples): Loaded {len(train_reader)} items.")
except Exception as e:
    print(f"❌ Error loading Train set: {e}")

# Target Reader (Test) - Used for Evaluation
try:
    try:
        test_reader = AdMIReReader(data_root_path='/content/admire_data/test_data', split="Test", mode="qwen")
    except Exception as test_split_error:
        # Fallback if the TSV is named 'Dev' inside the zip. Catching Exception
        # (not a bare except) lets KeyboardInterrupt through, and the original
        # failure is reported instead of being swallowed.
        print(f"⚠️ Could not load split 'Test' ({test_split_error}); retrying with split 'Dev'.")
        test_reader = AdMIReReader(data_root_path='/content/admire_data/test_data', split="Dev", mode="qwen")
    print(f"✅ Target (Evaluation): Loaded {len(test_reader)} items.")
except Exception as e:
    print(f"❌ Error loading Test set: {e}")

✅ Copied admire_dataset.py to local runtime.
📦 Extracting AdMIRe Subtask A Train.zip...
📦 Extracting AdMIRe Subtask A Test (Labelled).zip...

--- INITIALIZING READERS ---
🕵️ Scanning /content/admire_data/train_data for Train TSV...
✅ Loaded TSV: /content/admire_data/train_data/train/subtask_a_train.tsv
✅ Images located at: /content/admire_data/train_data/train/
✅ Source (Examples): Loaded 70 items.
🕵️ Scanning /content/admire_data/test_data for Test TSV...
✅ Loaded TSV: /content/admire_data/test_data/test/subtask_a_test.tsv
✅ Images located at: /content/admire_data/test_data/test/
✅ Target (Evaluation): Loaded 15 items.


In [None]:
# ==========================================
# CELL 3: MODEL DEFINITION (FEW-SHOT)
# ==========================================
from transformers import AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig
from qwen_vl_utils import process_vision_info
import torch

class QwenFewShot:
    """4-bit quantized Qwen-VL wrapper for few-shot idiom/image selection.

    Each example and the target are rendered as a chat turn containing the
    candidate images followed by an instruction to answer as ``[[number]]``.
    """

    def __init__(self, model_name="qwen3"):
        """Load the model and processor from the Hugging Face Hub.

        Args:
            model_name: Any string containing ``"qwen3"`` selects
                ``Qwen/Qwen3-VL-4B-Instruct``; everything else selects
                ``Qwen/Qwen2.5-VL-3B-Instruct``.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.hf_id = "Qwen/Qwen3-VL-4B-Instruct" if "qwen3" in model_name else "Qwen/Qwen2.5-VL-3B-Instruct"

        print(f"🤖 Loading {self.hf_id}...")
        bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)

        self.model = AutoModelForVision2Seq.from_pretrained(
            self.hf_id, quantization_config=bnb_config, device_map="auto", trust_remote_code=True
        )
        self.processor = AutoProcessor.from_pretrained(self.hf_id, trust_remote_code=True)
        # Optimization: bound the per-image pixel budget.
        # NOTE(review): presumably this limits the number of visual tokens (and
        # memory) per image; confirm the installed processor version honors
        # these attributes.
        self.processor.image_processor.max_pixels = 336 * 336
        self.processor.image_processor.min_pixels = 224 * 224

    def format_user_turn(self, sentence, images):
        """Build the content list of one user turn.

        Args:
            sentence: Text containing the idiom.
            images: Candidate images, in the order they are numbered.

        Returns:
            A list of content dicts: for every image an ``image`` entry
            followed by a ``"[Image k] "`` text tag (1-based), then one text
            entry with the task instructions.
        """
        content = []
        for i, img in enumerate(images):
            content.append({"type": "image", "image": img})
            content.append({"type": "text", "text": f"[Image {i+1}] "})

        prompt = (
            f"\nIdiom: \"{sentence}\"\n"
            "Task: Select the image that best represents the METAPHORICAL meaning.\n"
            "Steps: Define the abstract meaning, identify the matching image, and output the number.\n"
            "Final Answer Format: [[number]]"
        )
        content.append({"type": "text", "text": prompt})
        return content

    def predict_few_shot(self, target_sentence, target_images, examples, max_new_tokens=200):
        """Generate the model's answer for one target given few-shot examples.

        Args:
            target_sentence: Sentence/idiom to classify.
            target_images: Candidate images for the target.
            examples: List of dicts ``{'text': str, 'images': [PIL],
                'answer': int}`` where ``answer`` is the 0-based index of the
                correct image. ``None`` or an empty list means zero-shot.
            max_new_tokens: Generation budget for the reply (default 200).

        Returns:
            The newly generated reply text, stripped of surrounding whitespace.
        """
        messages = []

        # 1. ADD EXAMPLES (The "Few-Shot" Context)
        for ex in examples or ():
            # User Turn (Example)
            messages.append({
                "role": "user",
                "content": self.format_user_turn(ex['text'], ex['images'])
            })

            # Assistant Turn (Ideal Response)
            # A hardcoded 'perfect' response; the image number shown to the
            # model is 1-based, hence the +1.
            ans_idx = ex['answer'] + 1
            ideal_response = (
                f"The idiom \"{ex['text']}\" is abstract. "
                f"Image {ans_idx} depicts this metaphor correctly. "
                f"Final Answer: [[{ans_idx}]]"
            )
            messages.append({
                "role": "assistant",
                "content": [{"type": "text", "text": ideal_response}]
            })

        # 2. ADD TARGET (The actual question)
        messages.append({
            "role": "user",
            "content": self.format_user_turn(target_sentence, target_images)
        })

        # 3. GENERATE
        text = self.processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        image_inputs, video_inputs = process_vision_info(messages)

        inputs = self.processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            generated_ids = self.model.generate(**inputs, max_new_tokens=max_new_tokens)

        # generate() returns prompt + continuation. Slice the prompt off by
        # token count instead of splitting the decoded text on "assistant\n":
        # the string split truncates the reply whenever the generated text
        # itself contains that marker.
        new_token_ids = [
            full_ids[len(prompt_ids):]
            for prompt_ids, full_ids in zip(inputs["input_ids"], generated_ids)
        ]
        output = self.processor.batch_decode(new_token_ids, skip_special_tokens=True)[0]
        return output.strip()

# Initialize
# Instantiated once at cell level; a name containing "qwen3" selects
# Qwen/Qwen3-VL-4B-Instruct in QwenFewShot.__init__. The evaluation cell calls
# `model.predict_few_shot(...)` on this object.
model = QwenFewShot(model_name="qwen3")

🤖 Loading Qwen/Qwen3-VL-4B-Instruct...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]



model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading (incomplete total...): 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading weights:   0%|          | 0/713 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

In [None]:
# ==========================================
# CELL 4: FEW-SHOT EXECUTION
# ==========================================
import time
import re
import random
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm

# Configuration
# LIMIT_ITEMS: maximum number of test items to evaluate; a falsy value
# (None or 0) disables the limit and the whole test set is processed.
LIMIT_ITEMS = None
# NUM_SHOTS: how many train items are sampled as in-context examples for each
# test item.
NUM_SHOTS = 1  # 1-Shot is usually best for speed/memory. 2-Shot might OOM on Colab.

# Helper to load images safely
def load_images_for_item(item_data, placeholder_size=(224, 224)):
    """Load every candidate image of a dataset item as an RGB image.

    Args:
        item_data: Mapping with an ``'image_paths'`` entry (iterable of paths;
            the string ``"MISSING"`` marks an absent image).
        placeholder_size: ``(width, height)`` of the black stand-in image.

    Returns:
        A list with exactly one image per entry of ``image_paths``, in the same
        order. Entries that are missing, not path-like, non-existent on disk
        or unreadable are replaced by a black placeholder, so positions (and
        therefore answer indices) stay aligned.
    """
    def _placeholder():
        return Image.new('RGB', placeholder_size, color='black')

    imgs = []
    for p in item_data['image_paths']:
        # Non-string values (e.g. None for an empty cell) would make
        # os.path.exists raise, so they are treated like "MISSING".
        usable = isinstance(p, (str, os.PathLike)) and p != "MISSING" and os.path.exists(p)
        if not usable:
            imgs.append(_placeholder())
            continue
        try:
            # Image.open is lazy and keeps the file open; convert() loads the
            # pixels into a new image, so the handle can be closed right away.
            with Image.open(p) as raw:
                imgs.append(raw.convert("RGB"))
        except Exception as exc:
            # Report the failure instead of silently hiding it: a placeholder
            # in place of a real candidate can change the prediction.
            print(f"⚠️ Could not read image {p}: {exc}. Using a black placeholder.")
            imgs.append(_placeholder())
    return imgs

print(f"🚀 Starting {NUM_SHOTS}-Shot Evaluation on {len(test_reader)} items...")
results = []
report_data = []
save_path = f"{project_path}/results_qwen_fewshot.tsv"

# Dedicated, seeded RNG: the few-shot examples drawn for each test item are the
# same on every run, so accuracies are comparable between runs.
SHOT_SEED = 42
shot_rng = random.Random(SHOT_SEED)


def _parse_prediction(raw_text, num_options):
    """Return the 0-based image index named in ``raw_text``, or -1 if unusable.

    The ``[[n]]`` answer format is preferred; otherwise the last number in the
    text is used. A reply with no number, or a number outside
    ``1..num_options``, yields -1 ("no prediction") rather than defaulting to
    the first image, which would be scored as a real vote.
    """
    match = re.search(r"\[\[(\d+)\]\]", raw_text)
    if match:
        token = match.group(1)
    else:
        # Fallback: last number mentioned anywhere in the reply.
        nums = re.findall(r'\d+', raw_text)
        token = nums[-1] if nums else None
    if token is None:
        return -1
    idx = int(token) - 1
    return idx if 0 <= idx < num_options else -1


def _save_results():
    """Write everything collected so far to the results TSV on Drive."""
    pd.DataFrame(results).to_csv(save_path, sep='\t', index=False)


# Pre-fetch Train indices for random sampling
train_indices = list(range(len(train_reader)))

count = 0
for item in tqdm(test_reader):
    if LIMIT_ITEMS and count >= LIMIT_ITEMS: break
    start_time = time.time()

    # 1. PREPARE EXAMPLES (Shots)
    example_data = []
    shot_indices = shot_rng.sample(train_indices, NUM_SHOTS)

    for shot_idx in shot_indices:
        ex_item = train_reader[shot_idx]
        ex_imgs = load_images_for_item(ex_item)
        example_data.append({
            'text': ex_item['text'],
            'images': ex_imgs,
            'answer': ex_item['label']
        })

    # 2. PREPARE TARGET
    target_imgs = load_images_for_item(item)

    # 3. PREDICT
    # Defaults describe a failed item: no raw text and no predicted index.
    prediction_raw = "ERROR"
    top_pred_idx = -1

    try:
        prediction_raw = model.predict_few_shot(item['text'], target_imgs, example_data)
        top_pred_idx = _parse_prediction(prediction_raw, len(target_imgs))
    except Exception as e:
        print(f"❌ Error at item {count}: {e}")
        # If OOM happens, clear cache
        torch.cuda.empty_cache()

    # 4. REPORT
    # Labels are compared 0-based; the report shows 1-based image numbers, with
    # -1 meaning "no usable prediction".
    is_correct = (top_pred_idx == item['label'])
    elapsed = time.time() - start_time
    true_answer = item['label'] + 1
    model_prediction = top_pred_idx + 1 if top_pred_idx != -1 else -1

    report_data.append({
        "True_Answer": true_answer,
        "Model_Prediction": model_prediction,
        "Result": "TRUE" if is_correct else "FALSE",
        "Time_Sec": round(elapsed, 2)
    })

    # The first three columns are what the report cells read; the numeric
    # answers are appended so a report can be rebuilt from the TSV alone.
    results.append({
        "text": item['text'],
        "prediction_raw": prediction_raw,
        "is_correct": is_correct,
        "true_answer": true_answer,
        "model_prediction": model_prediction
    })

    # Checkpoint every 5 items so a crash or disconnect loses little work.
    if count % 5 == 0:
        _save_results()
    count += 1

# Final Stats
df_report = pd.DataFrame(report_data)
if report_data:
    acc = (df_report["Result"] == "TRUE").mean() * 100
    print(f"🏆 Final {NUM_SHOTS}-Shot Accuracy: {acc:.2f}%")
else:
    # An empty frame has no "Result" column; avoid a KeyError.
    acc = float("nan")
    print("⚠️ No items were evaluated; accuracy is undefined.")
_save_results()

🚀 Starting 1-Shot Evaluation on 15 items...


  0%|          | 0/15 [00:00<?, ?it/s]

🏆 Final 1-Shot Accuracy: 20.00%


In [None]:
# ==========================================
# CELL: INSPECT 1-SHOT RESULTS
# ==========================================
import pandas as pd
import os

# Results file written by the evaluation cell (same Drive location).
project_path = '/content/drive/MyDrive/AdMIRe_Project'
file_path = f"{project_path}/results_qwen_fewshot.tsv"

if not os.path.exists(file_path):
    print(f"❌ File not found: {file_path}")
    print("Check if the previous step finished correctly.")
else:
    df = pd.read_csv(file_path, sep='\t')

    # Share of rows flagged correct, as a percentage.
    accuracy = df["is_correct"].eq(True).mean() * 100

    print("=" * 40)
    print(f"📊 1-SHOT RESULTS ({len(df)} Items)")
    print(f"🏆 Accuracy: {accuracy:.2f}%")
    print("=" * 40)

    # Full plain-text table restricted to the three columns of interest.
    print(df.loc[:, ['text', 'prediction_raw', 'is_correct']].to_string())

📊 1-SHOT RESULTS (15 Items)
🏆 Accuracy: 20.00%
                                                                                                                                                                                           text                                                                                                                                                                                                                                                                                                                                                                                         prediction_raw  is_correct
0                                                       The place got quite lively at one stage as a hen party moved in, with the bride-to-be in fancy dress with large balloons tied onto her.                                                                                                         The idiom refers to a lively, festive gathering, specifically 

In [None]:
# ==========================================
# CELL: PRINT CLEAN REPORT
# ==========================================
import pandas as pd
import os

# 1. Results file to summarise.
# Point this at 'results_qwen_train.tsv' or 'results_qwen_2shot.tsv' if needed.
file_path = '/content/drive/MyDrive/AdMIRe_Project/results_qwen_fewshot.tsv'

if not os.path.exists(file_path):
    print(f"❌ File not found: {file_path}")
else:
    df = pd.read_csv(file_path, sep='\t')

    # 2. TRUE/FALSE text version of the correctness flag.
    df['Result'] = ["TRUE" if flag else "FALSE" for flag in df['is_correct']]

    # 3. This view relies only on the 'is_correct' and 'prediction_raw' columns
    # of the TSV: each row shows the TRUE/FALSE verdict next to the beginning
    # of the raw model reply. For the numeric true/predicted answers of the
    # latest run, display the in-memory `report_data` right after the loop.
    print("\n" + "=" * 40)
    print(f"📊 REPORT ({len(df)} Items)")
    print("=" * 40)

    display_df = df[['is_correct', 'Result']].copy()
    # First 50 characters of the reply, always followed by an ellipsis.
    display_df['Prediction_Snippet'] = df['prediction_raw'].str[:50] + "..."

    print(display_df[['Result', 'Prediction_Snippet']].to_string())
    print("=" * 40)

    acc = df['is_correct'].mean() * 100
    print(f"🏆 Final Accuracy: {acc:.2f}%")


📊 REPORT (15 Items)
   Result                                     Prediction_Snippet
0    TRUE  The idiom refers to a lively, festive gathering, s...
1    TRUE  The idiom "After all, they barely even have snail ...
2   FALSE  The idiom describes a golem with a heart of stone,...
3   FALSE  The idiom "caught red-handed" means being caught i...
4   FALSE  The idiom "cold feet" refers to hesitation or fear...
5    TRUE  The idiom refers to people who are inactive and se...
6   FALSE  The idiom refers to the metaphorical use of "peas ...
7   FALSE  The idiom "At first I thought that complaints that...
8   FALSE  The idiom describes a process of evaporation to le...
9   FALSE  The idiom "give the green light" is a metaphor for...
10  FALSE  The idiom refers to the atmosphere of a place wher...
11  FALSE  The idiom refers to a flying saucer as a metaphor ...
12  FALSE  The idiom "Man is produced by a direct creative ac...
13  FALSE  The idiom "Seasonal farm workers are primarily emp...
14  

In [None]:
# Run this IMMEDIATELY after the loop finishes (while variables are still in memory)
import pandas as pd

# `report_data` is the per-item list of dicts built by the evaluation loop; it
# only exists while that run's variables are still in the kernel.
df_clean = pd.DataFrame(report_data)

# Banner, table and closing rule emitted in one call; the output is the same
# as printing each part on its own line.
print(
    "\n" + "=" * 40,
    f"📊 REPORT ({len(df_clean)} Items)",
    "=" * 40,
    df_clean.to_string(index=True),
    "=" * 40,
    sep="\n",
)


📊 REPORT (15 Items)
    True_Answer  Model_Prediction Result  Time_Sec
0             4                 4   TRUE     53.83
1             2                 2   TRUE     46.99
2             3                 2  FALSE     43.96
3             3                 2  FALSE     44.17
4             4                 3  FALSE     41.87
5             1                 1   TRUE     42.06
6             3                 2  FALSE     41.41
7             5                 1  FALSE     42.27
8             2                 4  FALSE     41.68
9             4                 2  FALSE     41.32
10            2                 1  FALSE     40.58
11            5                 1  FALSE     41.12
12            2                 1  FALSE     43.99
13            2                 1  FALSE     41.13
14            5                 2  FALSE     41.27
