In [15]:
from datasets import load_dataset
from transformers import AutoTokenizer, VisualBertForMultipleChoice
from torch.utils.data import DataLoader
import torch
from PIL import Image
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel

# Pull A-OKVQA from the Hugging Face Hub and keep a handle on the validation
# split, which is the only split the cells below read from.
dataset = load_dataset("HuggingFaceM4/A-OKVQA")
val_dataset = dataset["validation"]

# Load the CLIP ViT-B/32 checkpoint: the processor turns images into pixel
# tensors, the model is kept around for computing image features.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Preprocessing function for images
def preprocess_image(image):
    """Run ``image`` through the CLIP processor and return its pixel tensor.

    The image is handed to the module-level ``clip_processor`` as-is (no
    format conversion happens here). The processor is asked for PyTorch
    tensors and only its ``"pixel_values"`` entry is returned.
    """
    return clip_processor(images=image, return_tensors="pt")["pixel_values"]

# Load the pretrained BERT tokenizer for the text side and the VisualBERT
# multiple-choice checkpoint. These are plain module-level objects, not functions.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")




In [None]:
from transformers import AutoTokenizer, VisualBertForMultipleChoice
import torch

def prepare_inputs(questions, choices, tokenizer, visual_embeds):
    """
    Prepare the model inputs for VisualBert with multiple choices.

    Every question is paired with each of its candidate answers, the pairs
    are tokenized as one flat batch, and the resulting tensors are folded
    back to ``(batch_size, num_choices, seq_len)``. The visual features of an
    example are shared by all of its choices.

    Args:
        questions (list of str): The questions, one per example.
        choices (list of list of str): The choices for each question. Every
            question must have the same, non-zero number of choices.
        tokenizer (transformers.AutoTokenizer): The tokenizer. It is called
            as ``tokenizer(text=..., text_pair=..., ...)`` with two flat,
            equally long lists of strings.
        visual_embeds (torch.Tensor): Visual embeddings of shape
            ``(batch_size, num_visual_features, visual_embedding_dim)``. A
            2-D tensor ``(batch_size, visual_embedding_dim)`` is also
            accepted and treated as one visual feature per example.

    Returns:
        dict: The tokenized inputs ready for the model. Text tensors have
        shape ``(batch_size, num_choices, seq_len)``; ``visual_embeds`` has
        shape ``(batch_size, num_choices, num_visual_features, dim)`` and the
        visual mask / token type ids drop the last dimension.

    Raises:
        ValueError: If the inputs are empty, the number of choice lists does
            not match the number of questions, the choice lists have
            different lengths, or ``visual_embeds`` has an unexpected shape.
    """
    batch_size = len(questions)
    if batch_size == 0:
        raise ValueError("questions must not be empty")
    if len(choices) != batch_size:
        raise ValueError(
            f"expected {batch_size} choice lists, got {len(choices)}"
        )
    num_choices = len(choices[0])
    if num_choices == 0 or any(len(options) != num_choices for options in choices):
        raise ValueError("every question needs the same non-zero number of choices")

    # Accept one feature vector per example by adding the feature axis.
    if visual_embeds.dim() == 2:
        visual_embeds = visual_embeds.unsqueeze(1)
    if visual_embeds.dim() != 3 or visual_embeds.shape[0] != batch_size:
        raise ValueError(
            "visual_embeds must have shape (batch_size, num_visual_features, dim) "
            f"or (batch_size, dim); got {tuple(visual_embeds.shape)}"
        )

    # Tokenize one (question, choice) pair per row: the tokenizer wants two
    # flat lists of strings, not a list of choice lists.
    flat_questions = [q for q, options in zip(questions, choices) for _ in options]
    flat_choices = [option for options in choices for option in options]
    encoding = tokenizer(
        text=flat_questions,
        text_pair=flat_choices,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=128,
    )

    # Fold the flat (batch_size * num_choices, seq_len) rows back into the
    # (batch_size, num_choices, seq_len) layout a multiple-choice head reads.
    inputs = {
        name: tensor.reshape(batch_size, num_choices, -1)
        for name, tensor in encoding.items()
    }

    # Repeat each example's visual features once per choice.
    visual_embeds = (
        visual_embeds.unsqueeze(1).expand(-1, num_choices, -1, -1).contiguous()
    )
    visual_shape = visual_embeds.shape[:-1]

    # Add visual embeddings and their corresponding masks and token type ids.
    # Token type ids must be integer ids (they index an embedding table); they
    # are set to 1 as in the VisualBERT documentation examples.
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_attention_mask": torch.ones(
            visual_shape, dtype=torch.float, device=visual_embeds.device
        ),
        "visual_token_type_ids": torch.ones(
            visual_shape, dtype=torch.long, device=visual_embeds.device
        ),
    })

    return inputs

# Load the BERT tokenizer and the VisualBERT multiple-choice checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")

# Example data
questions = ["What is in the motorcyclist's mouth?"] * 4  # Assume batch size of 4
choices = [["Gum", "Cigarette", "Nothing", "Candy"]] * 4

# Random stand-in visual features with the 3-D layout prepare_inputs documents:
# (batch_size, num_visual_features, visual_embedding_dim). The last dimension
# is read from the model config so it always matches the loaded checkpoint.
num_visual_features = 10
visual_embeds = torch.rand(
    len(questions), num_visual_features, model.config.visual_embedding_dim
)

# Prepare inputs
inputs = prepare_inputs(questions, choices, tokenizer, visual_embeds)

# Run the model (inference only, so skip gradient tracking)
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
image = val_dataset[0]["image"]
question = val_dataset[0]["question"]
choices = val_dataset[0]["choices"]
label = val_dataset[0]['choices'][val_dataset[0]['correct_choice_idx']]

# Get visual embeddings
visual_embeds = preprocess_image(image).squeeze(0)

visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

encoding = tokenizer([[question, question], choices[2:]], return_tensors="pt", padding=True)

inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}
inputs_dict.update(
    {
        "visual_embeds": visual_embeds,
        "visual_attention_mask": visual_attention_mask,
        "visual_token_type_ids": visual_token_type_ids,
    }
)
model.eval()
with torch.no_grad():
        outputs = model(**inputs_dict)

# Get the predicted answer index (use argmax)
predicted_idx = torch.argmax(outputs.logits, dim=-1).item()

# Check if the prediction matches the ground truth
if choices[2:][predicted_idx] == label:
    print("Correct!")


In [48]:
choices[2:][predicted_idx]

'popsicle stick'

In [47]:
predicted_idx

0

In [32]:
[question] * 4

["What is in the motorcyclist's mouth?",
 "What is in the motorcyclist's mouth?",
 "What is in the motorcyclist's mouth?",
 "What is in the motorcyclist's mouth?"]

In [30]:
choices

['toothpick', 'food', 'popsicle stick', 'cigarette']

In [26]:
# Peek at the answer choices of the first validation sample. The value is
# displayed directly instead of being stored in `question`, which other cells
# expect to hold the question text.
val_dataset[0]["choices"]

['toothpick', 'food', 'popsicle stick', 'cigarette']

In [19]:
visual_embeds.expand(1, 4, *visual_embeds.shape).shape

torch.Size([1, 4, 3, 224, 224])

In [14]:
# Print the image object of the first validation sample. It is not stored in
# `choices`, which other cells expect to hold the list of answer strings.
print(val_dataset[0]["image"])

<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=640x569 at 0x1B0B2DA30>


In [7]:
from PIL import Image
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel

# Load the CLIP ViT-B/32 checkpoint: the processor prepares images as pixel
# tensors, the model is kept around for computing image features.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")

# Preprocessing function for images
def preprocess_image(image):
    """Turn an image into the CLIP processor's pixel tensor.

    Args:
        image: Either a ``PIL.Image.Image`` (what the dataset's decoded
            ``image`` column yields) or an array-like that
            ``PIL.Image.fromarray`` understands.

    Returns:
        The ``"pixel_values"`` entry of the ``clip_processor`` output,
        requested as PyTorch tensors (with a leading batch dimension of 1
        for a single image).
    """
    # Dataset samples already arrive as PIL images; only raw arrays need to be
    # wrapped, and Image.fromarray is not meant to be fed a PIL image.
    if not isinstance(image, Image.Image):
        image = Image.fromarray(image)
    inputs = clip_processor(images=image, return_tensors="pt")
    return inputs["pixel_values"]

# Load the pretrained BERT tokenizer used to encode the question text
# (a plain module-level object, not a function).
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def prepare_batch(batch):
    """Build model inputs from one sample or from a DataLoader batch.

    Args:
        batch: Either a single sample dict with ``"question"`` and ``"image"``
            keys, or a list of such dicts, which is what a ``DataLoader``
            passes to its ``collate_fn``.

    Returns:
        The tokenizer output for the question(s), extended with
        ``"visual_embeds"``, ``"visual_token_type_ids"`` and
        ``"visual_attention_mask"``, each with a leading batch dimension.

    Raises:
        ValueError: If ``batch`` is an empty list.
    """
    # A collate_fn receives a list of samples; keep accepting a bare sample
    # dict so direct calls behave as before.
    samples = [batch] if isinstance(batch, dict) else list(batch)
    if not samples:
        raise ValueError("prepare_batch received an empty batch")

    # Get the question texts
    questions = [sample["question"] for sample in samples]

    # Get visual embeddings: each preprocess_image call returns a tensor with
    # a leading batch dimension of 1, so concatenating stacks the samples.
    # NOTE(review): these are the processor's pixel values, not
    # (visual_seq_length, visual_embedding_dim) features - confirm the model
    # consuming this batch accepts that layout.
    visual_embeds = torch.cat(
        [preprocess_image(sample["image"]) for sample in samples], dim=0
    )

    # Get text tokens
    inputs = tokenizer(questions, return_tensors="pt", padding=True, truncation=True)

    # Add visual information; the mask and type ids cover every dimension of
    # visual_embeds except the last one.
    inputs.update({
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_embeds.shape[:-1], dtype=torch.long),
        "visual_attention_mask": torch.ones(visual_embeds.shape[:-1], dtype=torch.float),
    })

    return inputs

config.json:   0%|          | 0.00/4.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/862k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



In [None]:
# Imported here because this cell is the first (in reading order) to use the
# class; the only other import of it sits in a cell further down.
from transformers import VisualBertForQuestionAnswering

# DataLoader with batch size 1 (to match the expected input size of VisualBERT)
val_dataloader = DataLoader(val_dataset, batch_size=1, collate_fn=prepare_batch)

# Load VisualBERT model for VQA
model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
model.eval()

# prepare_batch returns model inputs only, so the ground-truth indices are read
# from the dataset column. The loader is unshuffled with batch_size=1, so the
# i-th batch lines up with the i-th label.
targets = val_dataset["correct_choice_idx"]

correct = 0
total = 0

# Iterate through the validation set
with torch.no_grad():
    for batch, target in zip(val_dataloader, targets):
        # Forward pass through VisualBERT
        outputs = model(**batch)

        # Get the predicted answer index (use argmax)
        predicted_idx = torch.argmax(outputs.logits, dim=-1).item()

        # Check if the prediction matches the ground truth
        # NOTE(review): the VQA head presumably scores its own answer
        # vocabulary, while `target` indexes the sample's `choices` list -
        # confirm the two index spaces are comparable before trusting this
        # accuracy.
        if predicted_idx == target:
            correct += 1

        total += 1

# Calculate accuracy (guarding against an empty validation set)
accuracy = correct / total if total else 0.0
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

In [2]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['image', 'question_id', 'question', 'choices', 'correct_choice_idx', 'direct_answers', 'difficult_direct_answer', 'rationales'],
        num_rows: 17056
    })
    validation: Dataset({
        features: ['image', 'question_id', 'question', 'choices', 'correct_choice_idx', 'direct_answers', 'difficult_direct_answer', 'rationales'],
        num_rows: 1145
    })
    test: Dataset({
        features: ['image', 'question_id', 'question', 'choices', 'correct_choice_idx', 'direct_answers', 'difficult_direct_answer', 'rationales'],
        num_rows: 6702
    })
})


In [7]:
ds["train"].features['image']

Image(mode=None, decode=True, id=None)

In [3]:
from datasets import load_dataset
from transformers import BertTokenizer, VisualBertForQuestionAnswering
from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from torch.utils.data import DataLoader, Dataset
from PIL import Image, ExifTags
import torch

# Run on the GPU when CUDA is usable; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [4]:
# Load A-OKVQA dataset (only validation set)
dataset = load_dataset("HuggingFaceM4/A-OKVQA", split="validation[:200]")  # Limit to 200 samples