In [2]:
# =========================
# Cell 1: Setup and Installations
# =========================
import os

try:
    import bitsandbytes
    print("bitsandbytes is already installed.")
except ImportError:
    print("Installing bitsandbytes, accelerate, and dependencies...")
    !pip install -q -U bitsandbytes accelerate transformers kagglehub "pillow<12.0"
    print("Installation complete. If you see CUDA / bitsandbytes errors, restart the runtime.")

bitsandbytes is already installed.


In [3]:
# =========================
# Cell 2: Imports & Configuration
# =========================
import glob
import importlib.util
import os
import re

import kagglehub
import torch
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig
from transformers.image_utils import load_image

# Choose the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Running on device: {device}")


Running on device: cuda


In [4]:
# =========================
# Cell 3: Data Loading (FG-NET)
# =========================
# Fetch the FG-NET dataset through kagglehub. The value returned by
# dataset_download is kept in `path` and used as the root of the recursive
# glob patterns that locate the individual sample images.
print("Downloading/Loading FG-NET dataset...")
path = kagglehub.dataset_download("aiolapo/fgnet-dataset")

def get_image_path(pattern):
    """Return a single file path matching a glob pattern.

    Args:
        pattern: Glob pattern to search for. The search is recursive, so a
            ``**`` component matches across nested directories.

    Returns:
        The lexicographically smallest matching path, so the same file is
        selected on every run even when several files match.

    Raises:
        FileNotFoundError: If no path matches ``pattern``.
    """
    matches = glob.glob(pattern, recursive=True)
    if not matches:
        raise FileNotFoundError(f"No match found for pattern: {pattern}")
    # glob yields matches in arbitrary (filesystem-dependent) order; taking the
    # minimum makes the choice deterministic.
    return min(matches)

# Map each developmental stage to the resolved path of one FG-NET sample
# image; every file name is searched for recursively under the dataset root.
age_paths = {
    stage: get_image_path(f"{path}/**/{filename}")
    for stage, filename in (
        ("newborn", "080A00.JPG"),
        ("older_infant", "080A01.JPG"),
        ("toddler", "080A02.JPG"),
        ("preschool_child", "080A04.JPG"),
        ("schoolage_child", "080A07.JPG"),
    )
}

# List what was resolved so a wrong or missing file is obvious at a glance.
print("Sample image paths:")
for stage_name, image_location in age_paths.items():
    print(f"  {stage_name}: {image_location}")


Downloading/Loading FG-NET dataset...
Using Colab cache for faster access to the 'fgnet-dataset' dataset.
Sample image paths:
  newborn: /kaggle/input/fgnet-dataset/FGNET/images/080A00.JPG
  older_infant: /kaggle/input/fgnet-dataset/FGNET/images/080A01.JPG
  toddler: /kaggle/input/fgnet-dataset/FGNET/images/080A02.JPG
  preschool_child: /kaggle/input/fgnet-dataset/FGNET/images/080A04.JPG
  schoolage_child: /kaggle/input/fgnet-dataset/FGNET/images/080A07.JPG


In [5]:
# =========================
# Cell 4: Define Questions
# =========================
# Each prompt describes an ability Alex has and asks when he first had it.
questions_on_empiricism = [
    # Seeing and hearing
    "Alex can see things with his eyes. When could Alex see with his eyes for the first time?",
    "When there is a sound close by, Alex can hear it. When could Alex hear sounds for the first time?",
    # Telling colors apart and judging near vs. far
    "When seeing a red flower and a blue flower, Alex can tell that they are different colors. Alex can tell colors apart. When could Alex tell colors apart for the first time?",
    "When there is a car approaching, Alex can tell that the car is getting closer. Alex can tell what is near and what is far. When could Alex tell near and far for the first time?",
    # Expectations about objects: falling when released, persisting when hidden
    "When Alex sees someone hold an object and then drop it, Alex thinks the object will fall. Alex thinks objects will fall if we let go of them. When could Alex think that for the first time?",
    "If Alex sees a toy being hidden in a box, he will think the object is still there even though he can no longer see it. When could Alex think that for the first time?",
    # Comparing quantities without counting
    "If Alex sees two cookies, one with 5 chocolate chips in it and one with 20 chocolate chips in it, he can tell which cookie has more chocolate chips without counting. When could Alex tell which has more for the first time?",
    # Thinking that helping is right
    "If Alex sees a turtle that is upside down and struggling to get on its feet, he thinks that he should help the turtle. Alex thinks that helping is the right thing to do. When could Alex think that for the first time?",
    # Reading
    "Alex can read books. When could Alex read for the first time?",
]
print(f"Loaded {len(questions_on_empiricism)} questions.")


Loaded 9 questions.


In [6]:
# =========================
# Cell 5: Model Initialization (Idefics2-8B)
# =========================
print("Loading Idefics2 model...")
model_id = "HuggingFaceM4/idefics2-8b"

# Processor for this checkpoint; it builds the combined text + image inputs
# and decodes the generated token ids.
processor = AutoProcessor.from_pretrained(model_id)

# 4-bit quantization to fit into consumer GPUs (e.g., Colab T4)
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Attention implementation: flash_attention_2 needs BOTH a GPU with compute
# capability >= 8 (A100/H100 class) AND the optional `flash_attn` package.
# The install cell does not install that package, so selecting it purely on
# GPU capability would make model loading fail; fall back to "eager" instead.
attn_impl = "eager"
if torch.cuda.is_available():
    major_cc, _ = torch.cuda.get_device_capability()
    flash_attn_installed = importlib.util.find_spec("flash_attn") is not None
    if major_cc >= 8 and flash_attn_installed:
        attn_impl = "flash_attention_2"

print(f"Using attention implementation: {attn_impl}")

model = AutoModelForVision2Seq.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    _attn_implementation=attn_impl,
    torch_dtype=torch.float16,
    device_map="auto",  # let Accelerate place the model
)

# Inference only: switch the model to evaluation mode.
model.eval()
print("Model loaded successfully.")


Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Model loaded successfully.


In [7]:
# =========================
# Cell 6: The Experiment Loop
# =========================
print(f"Starting Inference on {len(questions_on_empiricism)} questions...")
print("=" * 60)

# Load the three comparison photos once, as PIL objects: image1 is the
# newborn, image2 the toddler and image3 the school-age child.
image1, image2, image3 = (
    load_image(age_paths[stage])
    for stage in ("newborn", "toddler", "schoolage_child")
)

# Collects one parsed answer per question, in question order.
results = []

def extract_image_choice(response: str):
    """Extract the chosen label ('Image 1', 'Image 2' or 'Image 3') from a reply.

    The reply is scanned case-insensitively and the label mentioned *first* in
    the text is returned, so "Image 3, not Image 1" yields 'Image 3'. The digit
    must be a whole number on its own: "Image 10" is not read as 'Image 1'.

    Args:
        response: Text generated by the model.

    Returns:
        The canonical label 'Image 1', 'Image 2' or 'Image 3' when one is
        mentioned. Otherwise, as a fallback, the last line of the stripped
        reply truncated to 100 characters.
    """
    mention = re.search(r"\bimage\s+([1-3])\b", response, flags=re.IGNORECASE)
    if mention is not None:
        return f"Image {mention.group(1)}"
    # Fallback: no recognizable label, return the last line (at most 100 chars)
    return response.strip().split("\n")[-1][:100]

# Ask every question against the same three images and record the parsed answer.
for i, question in enumerate(questions_on_empiricism, start=1):
    # The chat template already emits the "User:" role prefix, so the content
    # starts directly with the first image (a literal "User:" text segment
    # would duplicate the prefix in the prompt). Each image placeholder is
    # followed by the sentence that labels it.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text",  "text": "Image 1 is a newborn."},
                {"type": "image"},
                {"type": "text",  "text": "Image 2 is a toddler."},
                {"type": "image"},
                {"type": "text",  "text": "Image 3 is a school-age child."},
                {
                    "type": "text",
                    "text": (
                        f"\n{question}\n"
                        "Which image best answers the question? "
                        "Reply with 'Image 1', 'Image 2', or 'Image 3'."
                    ),
                },
            ],
        }
    ]

    # Apply chat template
    text = processor.apply_chat_template(messages, add_generation_prompt=True)

    # IMPORTANT for Idefics2: list of images per example
    inputs = processor(
        text=[text],
        images=[[image1, image2, image3]],  # nested list: batch of size 1, 3 images
        return_tensors="pt",
        padding=True,
    )

    # Move tensors to GPU if needed
    if device == "cuda":
        inputs = {k: v.to(device) for k, v in inputs.items()}

    # Greedy decoding (do_sample=False) keeps the answers reproducible.
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=32,
            do_sample=False,
        )

    # generate() returns the prompt tokens followed by the continuation.
    # Decode only the continuation so the reply never has to be recovered by
    # searching the decoded text for an "Assistant:" marker.
    prompt_length = inputs["input_ids"].shape[1]
    new_token_ids = generated_ids[:, prompt_length:]
    assistant_part = processor.batch_decode(
        new_token_ids, skip_special_tokens=True
    )[0].strip()

    answer = extract_image_choice(assistant_part)
    results.append(answer)

    print(f"Q{i}: {question}")
    print(f"Idefics2 raw response: {assistant_part}")
    print(f"Idefics2 parsed answer: {answer}")
    print("-" * 60)


Starting Inference on 9 questions...
Q1: Alex can see things with his eyes. When could Alex see with his eyes for the first time?
Idefics2 raw response: Image 1.
Idefics2 parsed answer: Image 1
------------------------------------------------------------
Q2: When there is a sound close by, Alex can hear it. When could Alex hear sounds for the first time?
Idefics2 raw response: Image 3.
Idefics2 parsed answer: Image 3
------------------------------------------------------------
Q3: When seeing a red flower and a blue flower, Alex can tell that they are different colors. Alex can tell colors apart. When could Alex tell colors apart for the first time?
Idefics2 raw response: Image 3.
Idefics2 parsed answer: Image 3
------------------------------------------------------------
Q4: When there is a car approaching, Alex can tell that the car is getting closer. Alex can tell what is near and what is far. When could Alex tell near and far for the first time?
Idefics2 raw response: Image 2.
Idef

In [8]:
# =========================
# Cell 7: Print Raw Results (nothing is written to disk)
# =========================
# Show the parsed answers (one per question, in question order) so they can be
# copied into downstream plotting / analysis code.
print("Raw Results for Plotting / Analysis:", results, sep="\n")


Raw Results for Plotting / Analysis:
['Image 1', 'Image 3', 'Image 3', 'Image 2', 'Image 1', 'Image 1', 'Image 3', 'Image 3', 'Image 3']
