In [None]:
from datasets import load_dataset 
dataset = load_dataset("Lin-Chen/MMStar", split="val") 

Generating val split: 100%|██████████| 1500/1500 [00:00<00:00, 3166.54 examples/s]


NameError: name 'Image' is not defined

In [18]:
dataset

Dataset({
    features: ['index', 'question', 'image', 'answer', 'category', 'l2_category', 'meta_info'],
    num_rows: 1500
})

In [4]:
from PIL import Image 

# blank image 
image = Image.new('RGB', (224, 224), color=(0, 0, 0)) 
image.save('./blank_224.png')

In [20]:
dataset[0]

{'index': 0,
 'question': 'Which option describe the object relationship in the image correctly?\nOptions: A: The suitcase is on the book., B: The suitcase is beneath the cat., C: The suitcase is beneath the bed., D: The suitcase is beneath the book.',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x384>,
 'answer': 'A',
 'category': 'coarse perception',
 'l2_category': 'image scene and topic',
 'meta_info': {'source': 'MMBench',
  'split': 'val',
  'image_path': 'images/0.jpg'}}

In [30]:
# swift_dataset(dataset)
swift_dataset(dataset, 'vlm')

saved to /home/work/yuna/HPA/data/swift/vlm_mmstar_blind.jsonl


In [29]:
def swift_dataset(vqa_data, models='llm', blind=True): 

    output_path=f"/home/work/yuna/HPA/data/swift/{models}_mmstar"
    if blind: 
        output_path += "_blind"
    output_path += ".jsonl"

    with open(output_path, 'w', encoding='utf-8') as f:
        for item in vqa_data:
            question = item['question']
            if not models == 'llm':
                question = f"<image>\n{question}"
            
            item['conversation'] = [
                    {
                        "from": "human",
                        "value": question
                    },
                    # {
                    #     "from": "assistant",
                    #     "value": item['multiple_choice_answer']
                    # }
                ]
            if not models == 'llm':
                if blind: 
                    item['image'] = ["/home/work/yuna/HPA/data/blank_224.png"]
                else: 
                   item['image'] = [os.path.join(image_folder, item['image_id'])]
            else: 
               item.pop('image')
 
            f.write(json.dumps(item, ensure_ascii=False) + '\n')
    print(f"saved to {output_path}") 


In [17]:
modelscope_tsv(dataset, './mmstar_blind_224.tsv', image_folder='blind_224') 

Saved TSV with full VQA annotations to: ./mmstar_blind_224.tsv


{'index': 1500,
 'question': 'How many chirality centers does the following molecule have?',
 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=340x266>,
 'answer': 'A',
 'category': 'science & technology',
 'l2_category': 'biology & chemistry & physics',
 'meta_info': {'source': 'MMMU',
  'split': 'val',
  'image_path': 'images/1499.jpg'},
 'A': 'A',
 'B': 'B',
 'C': 'C',
 'D': 'D',
 'image_path': '/home/work/yuna/HPA/data/blind_224.png'}

In [13]:
import re

def extract_question_and_options(question_text):
    """Extract clean question and individual options from question text"""
    # Remove <image X> tokens
    question_clean = re.sub(r'<image\s+\d+>', '', question_text)
    
    # Remove \n characters
    question_clean = question_clean.replace('\n', ' ')
    
    # Split by "Options:" to separate question and options
    parts = question_clean.split('Options:', 1)
    question_only = parts[0].strip()
    
    # Initialize options
    options = {'A': '', 'B': '', 'C': '', 'D': ''}
    
    if len(parts) > 1:
        options_text = parts[1].strip()
        # Extract options using regex (handles "A:", "B:", etc.)
        option_pattern = r'([A-D]):\s*([^,]+?)(?=,\s*[A-D]:|$)'
        matches = re.findall(option_pattern, options_text)
        
        for letter, value in matches:
            # Clean up the value and skip 'nan'
            value = value.strip()
            if value.lower() != 'nan':
                options[letter] = value
    
    return question_only, options

In [16]:
import csv
import os 
import json 

def modelscope_tsv(items, output_path, image_folder='/home/work/yuna/VLMEval/data/val2014'):
    """
    Write a list of items to a TSV file with columns:
    index, answer, question, image_path

    index	category	answer	question	A	B	C	D	image_path
    1	Animals	A	What animal is this?	Dog	Cat	Tiger	Elephant	/root/LMUData/images/custom_mcq/dog.jpg
    2	Buildings	D	What building is this?	School	Hospital	Park	Museum	/root/LMUData/images/custom_mcq/AMNH.jpg
    3	Cities	B	Which city's skyline is this?	New York	Tokyo	Shanghai	Paris	/root/LMUData/images/custom_mcq/tokyo.jpg
    4	Vehicles	C	What is the brand of this car?	BMW	Audi	Tesla	Mercedes	/root/LMUData/images/custom_mcq/tesla.jpg
    5	Activities	A	What is the person in the picture doing?	Running	Swimming	Reading	Singing	/root/LMUData/images/custom_mcq/running.jpg

    Parameters:
        data: list of dicts, each containing:
              {
                  "answer": str,
                  "question": str,
                  "image_path": str
              }
        output_path: str, path to write .tsv file
    """
    with open(output_path, "w", encoding="utf-8", newline="") as f:
        if items:  # Check if items list is not empty
            # Get fieldnames from the first item after adding the new fields
            first_item = items[0].copy()
            first_item["index"] = 1
            if 'blind' in image_folder:
                first_item["image_path"] = f"/home/work/yuna/HPA/data/{image_folder}.png"
            else:
                first_item["image_path"] = os.path.join(image_folder, item["image_id"])
            
            choices = ['A', 'B', 'C', 'D'] 
            fieldnames = list(first_item.keys()) + choices 
            writer = csv.DictWriter(f, fieldnames=fieldnames, delimiter="\t")
            writer.writeheader()

            for idx, item in enumerate(items, start=1):
                item['question'], options = extract_question_and_options(item['question'])
                for option, choice in zip(options, choices): 
                    item[choice] = option 
                item["index"] = idx
                if 'blind' in image_folder:
                    item["image_path"] = f"/home/work/yuna/HPA/data/{image_folder}.png"
                else:
                    item["image_path"] = os.path.join(image_folder, item["image_id"])
                writer.writerow(item)
        
    print(f"Saved TSV with full VQA annotations to: {output_path}")
    return item 