In [None]:
# !pip install datasets torch transformers evaluate Tiktoken accelerate==0.26.0

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting filelock (from datasets)
  Downloading filelock-3.18.0-py3-none-any.whl.metadata (2.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-19.0.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting aiohttp (from datasets)
  Downloading aiohttp-3.11.13-cp310-cp310-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hu

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import evaluate
from datasets import load_dataset

In [2]:
dataset = load_dataset("AI4Math/MathVista")

In [3]:
test_data = dataset["testmini"]

In [8]:
import torch
from transformers import AutoModelForVision2Seq, AutoTokenizer, BlipProcessor, pipeline

import torch
from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Check if MPS (Apple Silicon) is being used
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"

from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM, AutoModelForMaskedLM, AutoTokenizer

def load_model(model_name):
    """Loads the correct model type based on architecture."""
    
    if "bert" in model_name:  # For BERT models
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
    
    elif "t5" in model_name or "flan" in model_name:  # For Flan-T5 or T5 models
        model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float32 if device == "mps" else torch.float16,  # MPS fix
            offload_folder="offload_weights"
        )
    
    elif "longformer" in model_name:  # For Longformer models
        model = AutoModelForMaskedLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float32 if device == "mps" else torch.float16,
            offload_folder="offload_weights"
        )

    else:  # For causal LMs (GPT, OPT, Falcon)
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_map="auto",
            torch_dtype=torch.float32 if device == "mps" else torch.float16,
            offload_folder="offload_weights"
        )
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def evaluate_model(model_name, dataset, max_samples=100):
    """Evaluates a model using accuracy on the MathVista dataset."""
    
    # Load tokenizer and model
    if "blip" in model_name:
        processor = BlipProcessor.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(model_name, device_map="auto")
        pipe = pipeline("image-to-text", model=model, tokenizer=processor.tokenizer)
    else:
        model, tokenizer = load_model(model_name)
        pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

    correct, total = 0, 0
    for sample in dataset.select(range(min(max_samples, len(dataset)))):
        question = sample["question"]
        image = sample["decoded_image"]  # Extract image
        answer = sample["answer"]

        # Ensure valid input
        if "blip" in model_name and image is not None:
            response = pipe(image)[0]["generated_text"]  # Get caption
        else:
            response = pipe(
                question,
                max_new_tokens=50,  
                pad_token_id=tokenizer.pad_token_id,
                truncation=True,
                do_sample=False
            )[0]["generated_text"]

        # Check if answer is correct
        if answer.strip().lower() in response.strip().lower():
            correct += 1
        total += 1

    # Compute accuracy
    acc = correct / total if total > 0 else 0
    return {"model": model_name, "accuracy": acc}

In [82]:
print(evaluate_model("google/flan-t5-small", test_data, max_samples=100))

Device set to use mps


{'model': 'google/flan-t5-small', 'accuracy': 0.16}


In [5]:
import torch
from transformers import BlipProcessor, BlipForQuestionAnswering

def evaluate_blip_vqa(dataset, max_samples=100):
    """Evaluates BLIP VQA model on MathVista dataset."""
    
    model_name = "Salesforce/blip-vqa-base"
    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForQuestionAnswering.from_pretrained(model_name).to("mps" if torch.backends.mps.is_available() else "cuda")

    correct, total = 0, 0
    for sample in dataset.select(range(min(max_samples, len(dataset)))):
        question = sample["question"]
        image = sample["decoded_image"]

        # Ensure image is valid
        if image is None:
            continue  

        # Process image + question
        inputs = processor(images=image, text=question, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs)
        response = processor.batch_decode(outputs, skip_special_tokens=True)[0]

        # Compare with expected answer
        answer = sample["answer"]
        if answer and answer.strip().lower() in response.strip().lower():
            correct += 1
        total += 1

    acc = correct / total if total > 0 else 0
    return {"model": model_name, "accuracy": acc}

# Run BLIP VQA Model
print(evaluate_blip_vqa(test_data, max_samples=100))

{'model': 'Salesforce/blip-vqa-base', 'accuracy': 0.13}


In [8]:
import torch
from transformers import ViltProcessor, ViltForQuestionAnswering
from PIL import Image

def evaluate_vilt_vqa_light(dataset, max_samples=100):
    """Evaluates ViLT VQA Base on MathVista dataset."""

    model_name = "dandelin/vilt-b32-finetuned-vqa"
    processor = ViltProcessor.from_pretrained(model_name)
    model = ViltForQuestionAnswering.from_pretrained(model_name).to(
        "mps" if torch.backends.mps.is_available() else "cuda"
    )

    correct, total = 0, 0
    for sample in dataset.select(range(min(max_samples, len(dataset)))):
        question = sample["question"]
        image = sample["decoded_image"]

        if image is None:
            continue  

        # Convert image to RGB format (fix channel dimension issue)
        if isinstance(image, Image.Image):  
            image = image.convert("RGB")

        # Process image + question with proper truncation
        inputs = processor(
            images=image,
            text=question,
            return_tensors="pt",
            truncation=True,   # Fix token length issue
            max_length=40,     # Set max sequence length to avoid overflow
            padding="max_length"
        ).to(model.device)

        outputs = model(**inputs)
        predicted_answer = processor.tokenizer.decode(outputs.logits.argmax(-1), skip_special_tokens=True)

        answer = sample["answer"]
        if answer and answer.strip().lower() in predicted_answer.strip().lower():
            correct += 1
        total += 1

    acc = correct / total if total > 0 else 0
    return {"model": model_name, "accuracy": acc}

# Run ViLT VQA Model
print(evaluate_vilt_vqa_light(test_data, max_samples=100))

{'model': 'dandelin/vilt-b32-finetuned-vqa', 'accuracy': 0.06}


In [16]:
!pip install transformers openxlab

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting openxlab
  Downloading openxlab-0.1.2-py3-none-any.whl.metadata (3.8 kB)
Collecting filelock (from transformers)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting oss2~=2.17.0 (from openxlab)
  Downloading oss2-2.17.0.tar.gz (259 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting pytz~=2023.3 (from openxlab)
  Downloading pytz-2023.4-py2.py3-none-any.whl.metadata (22 kB)
Collecting requests (from transformers)
  Downloading requests-2.28.2-py3-none-any.whl.metadata (4.6 kB)
Collecting rich~=13.4.2 (from openxlab)
  Downloading rich-13.4.2-py3-none-any.whl.metadata (18 kB)
Collecting setuptools~=60.2.0 (from openxlab)
  Downloading setuptools-60.2.0-py3-none-any.whl.metadata (5.1 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.65.2-py3-none-any.whl.metadata (56 kB)
Collecting crcmod>=1.7 (from oss2~=2.17.0->openxlab)
  Downloading crcmod-1.7.tar.gz (89 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollec

In [None]:
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration

def evaluate_blip2_vqa(dataset, max_samples=100):
    """Evaluates BLIP-2 on MathVista dataset."""

    model_name = "Salesforce/blip2-flan-t5-xl"
    processor = Blip2Processor.from_pretrained(model_name)
    model = Blip2ForConditionalGeneration.from_pretrained(model_name, device_map="auto").to(
        "mps" if torch.backends.mps.is_available() else "cuda"
    )

    correct, total = 0, 0
    for sample in dataset.select(range(min(max_samples, len(dataset)))):
        question = sample["question"]
        image = sample["decoded_image"]

        if image is None:
            continue  

        # Process image + question
        inputs = processor(images=image, text=question, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs, max_new_tokens=50)
        response = processor.tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

        # Compare with expected answer
        answer = sample["answer"]
        if answer and answer.strip().lower() in response.strip().lower():
            correct += 1
        total += 1

    acc = correct / total if total > 0 else 0
    return {"model": model_name, "accuracy": acc}

# Run BLIP-2 VQA Model
print(evaluate_blip2_vqa(test_data, max_samples=100))

In [13]:
import torch
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

def evaluate_instructblip_vqa(dataset, max_samples=100):
    """Evaluates InstructBLIP VQA model on the MathVista dataset."""

    model_name = "Salesforce/instructblip-flan-t5-xl"
    processor = InstructBlipProcessor.from_pretrained(model_name)
    model = InstructBlipForConditionalGeneration.from_pretrained(model_name).to(
        "mps" if torch.backends.mps.is_available() else "cuda"
    )

    correct, total = 0, 0
    for sample in dataset.select(range(min(max_samples, len(dataset)))):
        question = sample["question"]
        image = sample["decoded_image"]

        if image is None:
            continue

        inputs = processor(images=image, text=question, return_tensors="pt").to(model.device)
        outputs = model.generate(**inputs)
        predicted_answer = processor.decode(outputs[0], skip_special_tokens=True)

        answer = sample["answer"]
        if answer and answer.strip().lower() in predicted_answer.strip().lower():
            correct += 1
        total += 1

    acc = correct / total if total > 0 else 0
    return {"model": model_name, "accuracy": acc}

# Run InstructBLIP VQA Model
print(evaluate_instructblip_vqa(test_data, max_samples=100))

preprocessor_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/21.2k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/75.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.44k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.31k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/135k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [17]:
import torch
from transformers import QwenImageProcessor, QwenForVisionLanguageUnderstanding

def evaluate_qwen_vl(dataset, max_samples=100):
    """Evaluates Qwen-VL model on the MathVista dataset."""

    model_name = "Qwen/Qwen-VL"
    processor = QwenImageProcessor.from_pretrained(model_name)
    model = QwenForVisionLanguageUnderstanding.from_pretrained(model_name).to(
        "mps" if torch.backends.mps.is_available() else "cuda"
    )

    correct, total = 0, 0
    for sample in dataset.select(range(min(max_samples, len(dataset)))):
        question = sample["question"]
        image = sample["decoded_image"]

        if image is None:
            continue

        inputs = processor(images=image, text=question, return_tensors="pt").to(model.device)
        outputs = model(**inputs)
        predicted_answer = processor.decode(outputs.logits.argmax(-1), skip_special_tokens=True)

        answer = sample["answer"]
        if answer and answer.strip().lower() in predicted_answer.strip().lower():
            correct += 1
        total += 1

    acc = correct / total if total > 0 else 0
    return {"model": model_name, "accuracy": acc}

# Run Qwen-VL Model
print(evaluate_qwen_vl(test_data, max_samples=100))

ImportError: cannot import name 'QwenImageProcessor' from 'transformers' (/Users/swatisharma/.pyenv/versions/3.10.0/lib/python3.10/site-packages/transformers/__init__.py)