# VQA 

In [186]:
from glob import glob 
import pandas as pd
import os
from utils import load_data_to_dataframe 
import sys 
sys.path.append('/home/work/yuna/VLMEval/analysis/utils')
from question_type_mapper import question_type 

# Number of distinct labels in the imported question_type mapping.
print('Total # of unique VQA question types:', len({*question_type.values()}))

# Load the VQA validation JSON and the curated question table.
vqa = pd.read_json("/home/work/yuna/HPA/data/vqav2_1k/val.json", lines=False)
df = pd.read_csv('/home/work/yuna/HPA/eda/full_qs_s1.csv')

# qids of curated rows that have a question_type, and the VQA rows whose
# question_id is not among them.
vqa_qids = df.loc[df['question_type'].notna(), 'qid'].unique()
vqa_unsampled = vqa.loc[~vqa['question_id'].isin(vqa_qids)]
print('total vqa questions', len(vqa_qids))

Total # of unique VQA question types: 65
total vqa questions 277


In [209]:
# Row count of the VQA questions not present in the curated set.
vqa_unsampled.shape[0]

723

## Preprocessing 

In [228]:
def translate_question(client, question):
    """Ask the chat model to translate one English question into Korean.

    Args:
        client: Object exposing ``chat.completions.create(...)`` (an OpenAI
            client in this notebook).
        question: English question text interpolated into the prompt.

    Returns:
        str: The model reply with surrounding whitespace removed. The prompt
        asks for a JSON dictionary with a ``question_kr`` field, but the reply
        is returned as raw text and is not parsed here. An empty string is
        returned when the reply carries no text content.
    """
    prompt = f"""
            You are a precise translation and data-formatting assistant.

            Task:
            Given a multiple-choice question and its options in English, 
            return the result as a strict JSON dictionary with the following fields:
            - question_kr: Translate the QUESTION into Korean 
            
            Input:
            QUESTION: {question}
            """

    response = client.chat.completions.create(
        model="gpt-5-nano",   # or your preferred model
        messages=[{"role": "user", "content": prompt}],
        temperature=1,
    )

    # The message content may be None; calling .strip() on it would raise
    # AttributeError, so map that case to an empty string instead.
    content = response.choices[0].message.content
    if content is None:
        return ""
    return content.strip()


In [234]:
import json 
from openai import OpenAI
from sklearn.model_selection import train_test_split 
from utils import translate #,translate_question
from dotenv import load_dotenv

# Load environment variables via python-dotenv, then build the shared client
# from the key stored under the API_KEY variable.
load_dotenv()
OPENAI_API_KEY = os.environ.get('API_KEY')
client = OpenAI(api_key=OPENAI_API_KEY)

# Sample new questions 
def sample_new_qs(df, num_sample=0, types=['category', 'l2_category'], answer_type='choice', preprocess=True, filename=None):
    """Normalise a question table, optionally subsample, translate and save it.

    Args:
        df: DataFrame of questions. ``"Unnamed: 0"`` and ``question_id`` are
            renamed to ``qid`` and ``question`` to ``question_en``; rows that
            contain any missing value are dropped.
        num_sample: When > 0, number of rows to keep, drawn with
            ``train_test_split(..., random_state=42)``. When the table has no
            more rows than requested, all rows are kept.
        types: Column names shown in the sampling message. Stratification on
            them is disabled, so they do not influence the sample. The default
            list is never mutated.
        answer_type: Value written to the ``answer_type`` column and used in
            the output file name. For ``'choice'`` tables without an
            ``options`` column, ``question_en`` is split on the first
            ``'Options:'`` into ``question_en`` and ``options``.
        preprocess: When true, every row is sent to ``translate`` (tables with
            an ``options`` column) or ``translate_question`` (otherwise), and
            the parsed ``question_kr`` / ``options`` values are stored.
        filename: When not None, the result is written to
            ``./questions/processed/{filename}_{answer_type}.csv``.

    Returns:
        The processed DataFrame. Rows whose translation could not be parsed
        keep ``question_kr`` as None and their original ``options``.
    """
    # preprocessing
    # NOTE(review): an input holding both "Unnamed: 0" and "question_id" ends up
    # with two "qid" columns after this rename — confirm inputs never carry both.
    df = df.rename(columns={"Unnamed: 0": 'qid', 'question_id': 'qid', 'question': 'question_en'}).dropna(axis=0)
    df['qid'] = df['qid'].astype('Int64')
    df['answer_type'] = answer_type

    ## split options from questions
    if answer_type == 'choice' and 'options' not in df.columns:
        df[['question_en', 'options']] = df['question_en'].str.split('Options:', n=1, expand=True)

    if num_sample > 0:  ### sample new questions
        print(f"sampling new #{num_sample} from {types}")

        if num_sample < len(df):
            df, _ = train_test_split(
                df,
                train_size=num_sample,
                # stratify=df[types],
                random_state=42
            )
            # Independent copy so the column writes below never target a
            # slice of the pre-split frame.
            df = df.copy()
        else:
            # train_test_split rejects a train_size that leaves no test rows.
            print(f"only {len(df)} rows available; keeping all of them")

    if preprocess:
        has_options = 'options' in df.columns
        # Results are collected positionally and assigned once at the end:
        # label-based writes would hit every row sharing a duplicated index
        # label (e.g. frames built with pd.concat without ignore_index).
        translated_questions = []
        translated_options = []

        for _, row in df.iterrows():
            question = row["question_en"]
            original_options = row["options"] if has_options else None
            if has_options:
                output = translate(client, question, original_options)
            else:
                output = translate_question(client, question)

            try:
                parsed = json.loads(output)
            except (json.JSONDecodeError, TypeError):
                # TypeError covers a helper returning None instead of text.
                print("JSON decode error:", output)
                parsed = None
            print(parsed)

            # A failed or non-dict payload leaves this row untranslated
            # instead of aborting the whole run.
            if not isinstance(parsed, dict):
                parsed = {}
            translated_questions.append(parsed.get('question_kr'))
            translated_options.append(parsed.get('options', original_options))

        df['question_kr'] = translated_questions
        if has_options:
            df['options'] = translated_options

    if filename is not None:
        out_path = f'./questions/processed/{filename}_{answer_type}.csv'
        df.to_csv(out_path)
        print(f'saved to {out_path}')

    return df


## Check distribution 

In [187]:
# Per-question-type counts: curated subset (df) versus the full VQA table (vqa).
sd = df.groupby('question_type')['qid'].count().to_dict()  # subset
fd = vqa.groupby('question_type')['question_id'].count().to_dict()  # fullset

question_distribution = vqa['question_type'].value_counts()

dist = []
for type_name, total_count in fd.items():
    # Types missing from the subset are recorded in sd with a count of 0.
    subset_count = sd.setdefault(type_name, 0)
    dist.append({
        '#subset': subset_count,
        '#total': total_count,
        'done': subset_count / total_count,
        'question_type': type_name,
    })

# Highest coverage first, ties broken by the larger total.
d = pd.DataFrame(dist).sort_values(by=['done', '#total'], ascending=False)
d

In [None]:
### Sampling
# Question types whose 'done' ratio is below 0.4 are selected for further sampling.
# qtypes_to_sample = d[d['done'] ==0].question_type.values
qtypes_to_sample = d.loc[d['done'] < 0.4, 'question_type'].values
print(qtypes_to_sample)
vqa_tosample = vqa_unsampled.loc[vqa_unsampled['question_type'].isin(qtypes_to_sample)]

In [267]:
# Reload the processed VQA text questions, keeping one row per qid.
vqa = pd.read_csv('/home/work/yuna/HPA/eda/questions/processed/vqa_v1_text.csv').drop_duplicates('qid')
# NOTE(review): the line that creates `sampled_vqa` is commented out, so the concat
# below only works if `sampled_vqa` already exists in the kernel — on a fresh
# Restart & Run All this cell raises NameError. Confirm where it should come from.
# sampled_vqa = sample_new_qs(vqa_tosample, num_sample=100, types=['question_type'], answer_type='text', preprocess=True, filename=None)
# Append the newly sampled questions; the original row indices of both frames are kept.
vqa = pd.concat([vqa, sampled_vqa]) 
len(vqa)

277

## concat datasets 

In [None]:
import pandas as pd 
from glob import glob 

# Read every processed question CSV, unify the id column name and keep one row per qid per file.
dfs = [
    pd.read_csv(qs).rename(columns={'question_id': "qid"}).drop_duplicates(subset=['qid'])
    for qs in glob("/home/work/yuna/HPA/eda/questions/processed/*.csv")
]
df = pd.concat(dfs)

# NOTE(review): the result of this column selection is discarded, so all columns
# are still written below; it only raises if one of these columns is missing.
# Confirm whether `df = df[[...]]` was intended.
df[['qid', 'category', 'l2_category', 'question_en', 'options','answer_type', 'question_kr', 'index','question_type']]

# Exclude the 'science & technology' category, report the remaining row count, and save.
df = df[~(df['category'] == 'science & technology')]
print(len(df))
df.to_csv('./full_qs_s1.csv')

# MMStar 

In [247]:
### ALL MMStar questions 
mmstar = "/home/work/yuna/HPA/eda/questions/raw/ACL_mmstar_questions.xlsx"
# Open the workbook in a context manager so its file handle is closed as soon
# as the sheet names have been read, instead of staying open until garbage collection.
with pd.ExcelFile(mmstar) as mmstar_workbook:
    mmstar_sheet_names = mmstar_workbook.sheet_names
mmstar_sheet_names

['s1', 'others']

In [None]:
# Previously chosen MMStar questions: sheet 's1', read without a header row.
df = pd.read_excel(mmstar, sheet_name='s1', header=None)
# Five named columns followed by six unnamed filler columns.
df.columns = ['category', 'l2_category', 'question_id', 'question_en', 'question_kr'] + [''] * 6
df[['question_en', 'options']] = df['question_en'].str.split('Options:', n=1, expand=True)

# Order matters: rows with missing values are dropped while the filler columns
# are still present, and only afterwards are the filler columns removed.
df = (
    df.dropna(axis=0)
    .assign(answer_type='choice')
    .drop(columns=[''])  # .to_csv('./questions/mmstar_v1_choice.csv')
)
df['question_id'] = df['question_id'].astype(int)
s1 = pd.unique(df['question_id'])


In [277]:
# MMStar questions from the 'others' sheet (the ones not chosen so far).
unsampled_mmstar = pd.read_excel(mmstar, sheet_name='others')
unsampled_mmstar.columns = ['question_id', 'category', 'l2_category', 'question', 'answer', 'score']
# Split the raw question text on the first 'Options:' into question and options.
unsampled_mmstar[['question_en', 'options']] = unsampled_mmstar['question'].str.split('Options:', n=1, expand=True)
# Drop incomplete rows, tag the answer type and make the ids integers.
unsampled_mmstar = unsampled_mmstar.dropna(axis=0).assign(
    answer_type='choice',
    question_id=lambda frame: frame['question_id'].astype(int),
)
# df = df.drop(columns=['']) # .to_csv('./questions/mmstar_v1_choice.csv')
len(unsampled_mmstar.columns), sorted(unsampled_mmstar.columns)

(9,
 ['answer',
  'answer_type',
  'category',
  'l2_category',
  'options',
  'question',
  'question_en',
  'question_id',
  'score'])

In [None]:
### sample new mmstar questions 
# Samples 100 rows from the rows of `df` whose question_id is not in `s1_2`,
# then translates them (preprocess=True) without saving (filename=None).
# NOTE(review): `s1_2` is not defined in the visible notebook (only `s1` is) —
# confirm where it comes from; on a fresh kernel this raises NameError.
mm = sample_new_qs(df[~df['question_id'].isin(s1_2)], num_sample=100, types=['category', 'l2_category'], answer_type='choice', preprocess=True, filename=None) 

In [None]:
# Already translated MMStar questions: the s1_2 file (columns containing any
# missing value removed) stacked with the s3 file.
mm = pd.concat([
    pd.read_csv('./questions/processed/mmstar_translated_s1_2.csv').dropna(axis=1),
    pd.read_csv('/home/work/yuna/HPA/eda/questions/processed/mmstar_translated_s3.csv'),
])
# mm[columns].to_csv('./questions/processed/mmstar_translated_s1_2.csv')
# mm[columns].to_csv('./questions/processed/mmstar_translated_s1_2.csv', index=False)

# filter categories
mm = mm.loc[~mm['category'].isin(['math', 'science & technology'])]
mmqids = mm['qid'].unique()
print(len(mm), len(mmqids))

# Keep unsampled questions outside the excluded categories whose id is not already in mm.
unsampled_mmstar = unsampled_mmstar.loc[
    ~unsampled_mmstar['category'].isin(['math', 'science & technology'])
    & ~unsampled_mmstar['question_id'].isin(mmqids)
]

mm.groupby(['category']).count()

297


Unnamed: 0_level_0,qid,l2_category,question_en,options,answer_type,question_kr,Unnamed: 0.1,Unnamed: 0
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
coarse perception,62,62,62,62,62,62,18,18
fine-grained perception,88,88,88,88,88,88,19,19
instance reasoning,54,54,54,54,54,54,21,21
logical reasoning,36,36,36,36,36,36,14,14
math,9,9,9,9,9,9,4,4
science & technology,48,48,48,48,48,48,24,24


In [314]:
# Disambiguate a duplicated 'question_en' column: its second occurrence becomes 'question_en_2'.
cols = list(new_mmstar.columns)

# positions of every column labelled 'question_en'
positions = [idx for idx, label in enumerate(cols) if label == 'question_en']

# only rename when there really is a second occurrence
if positions[1:]:
    cols[positions[1]] = 'question_en_2'

new_mmstar.columns = cols

In [319]:
# Drop the column still named 'question_en', then give 'question_en_2' that name.
new_mmstar = (
    new_mmstar
    .drop(columns=['question_en'])
    .rename(columns={"question_en_2": "question_en"})
)

In [323]:
# Row count of the MMStar question table.
mm.shape[0]

300

In [327]:
# Stack the MMStar and VQA question tables and write them to s1.csv.
# NOTE(review): the row index is written too and is not reset after the concat —
# confirm that the resulting extra index column is wanted downstream.
pd.concat(
    [mm, vqa],
).to_csv('s1.csv')

In [332]:
# Read the combined question file back and show its row count.
allqs = pd.read_csv('./s1.csv')
allqs.shape[0]

677

In [330]:
# Number of rows that have both an English and a Korean question.
int(allqs[['question_en', 'question_kr']].notna().all(axis=1).sum())

677

In [325]:
# Row count of the VQA question table.
vqa.shape[0]

377

In [322]:
# Append the newly sampled MMStar questions to mm with a fresh 0..n-1 index
# (ignore_index discards both input indices).
# NOTE(review): re-running this cell appends new_mmstar again.
mm = pd.concat([mm, new_mmstar], ignore_index=True)

In [306]:
# True when any newly sampled qid is already present in mm.
# Series.isin works for every column dtype; `.values.isin` only exists when the
# column is backed by an extension array and fails on a plain NumPy array.
new_mmstar['qid'].isin(mm['qid']).any()

False

In [297]:
# Draw 60 of the unsampled MMStar questions and translate them; nothing is saved (filename=None).
new_mmstar = sample_new_qs(
    unsampled_mmstar,
    num_sample=60,
    types=['category', 'l2_category'],
    answer_type='choice',
    preprocess=True,
    filename=None,
)

sampling new #60 from ['category', 'l2_category']
{
  "question_kr": "이미지의 주요 주제는 무엇입니까?",
  "options": " A: A woman surfing 여성이 서핑하는 것, B: A man skiting 남자가 스케이팅을 하는 것, C: A man surfing 남자가 서핑하는 것, D: A woman skiting 여자가 스케이팅을 하는 것"
}
{'question_kr': '이미지의 주요 주제는 무엇입니까?', 'options': ' A: A woman surfing 여성이 서핑하는 것, B: A man skiting 남자가 스케이팅을 하는 것, C: A man surfing 남자가 서핑하는 것, D: A woman skiting 여자가 스케이팅을 하는 것'}
{
  "question_kr": "이 네 객체가 공통으로 가지는 속성은 무엇입니까?",
  "options": "A: sticky 끈적한, B: hard 딱딱한, C: stretchy 신축성 있는, D: nan 숫자가 아님"
}
{'question_kr': '이 네 객체가 공통으로 가지는 속성은 무엇입니까?', 'options': 'A: sticky 끈적한, B: hard 딱딱한, C: stretchy 신축성 있는, D: nan 숫자가 아님'}
{
  "question_kr": "이 이미지는 어떤 분위기를 전달하나요?",
  "options": "A: Cozy 아늑한, B: Anxious 불안한, C: Happy 행복한, D: Angry 화난"
}
{'question_kr': '이 이미지는 어떤 분위기를 전달하나요?', 'options': 'A: Cozy 아늑한, B: Anxious 불안한, C: Happy 행복한, D: Angry 화난'}
{
  "question_kr": "강조된 대륙은 어느 대륙입니까?",
  "options": "A: Europe, B: South America, C: Antarctica, D: Afric

In [14]:
# Replace mm with a translated 100-row sample of itself.
# NOTE(review): the cell overwrites its own input, so re-running it samples from the already reduced frame.
mm = sample_new_qs(
    mm,
    num_sample=100,
    types=['category', 'l2_category'],
    answer_type='choice',
    preprocess=True,
    filename=None,
)

Unnamed: 0.1,Unnamed: 0,qid,category,l2_category,qid.1,question_en,options,answer_type,question_kr
0,0,0,instance reasoning,single-instance reasoning,664,Where is the woman's blue bag located in the i...,"A: In her hand 그녀의 손에, B: On her shoulder 그녀의 ...",choice,이미지 속 여성의 파란 가방은 어디에 위치해 있나요?
1,1,1,coarse perception,image emotion,6,What feeling is shown in this image?\n,"A: engaged 참여하는, B: lonely 외로운, C: angry 화난, D...",choice,이 이미지에 어떤 감정이 나타나고 있나요?
2,2,2,instance reasoning,single-instance reasoning,662,Where is the rug in the living room located?\n,"A: Next to the door 문 옆에, B: Under the table 탁...",choice,거실의 러그는 어디에 위치해 있나요?
3,3,3,logical reasoning,common reasoning,773,Is there a teddy bear in the image?\n,"A: Can't tell 알 수 없음, B: No 아니오, C: Yes 네, D: ...",choice,이미지에 곰인형이 있나요?
4,4,4,instance reasoning,cross-instance relation reasoning,556,What is the relationship between the people in...,"A: commercial 상업적인, B: professional 전문적인, C: f...",choice,사진 속 사람들 간의 관계는 무엇입니까?
...,...,...,...,...,...,...,...,...,...
194,194,844,coarse perception,image scene and topic,92,What is the overall theme of the image?\n,"A: Beach vacation 해변 휴가, B: Athletic lifestyle...",choice,그 이미지를 전반적으로 나타내는 주제는 무엇입니까?
195,195,558,fine-grained perception,recognition,378,What is the color of the ears on the dessert i...,"A: Red, B: Brown, C: Black, D: White A: 빨간색, B...",choice,이미지의 오른쪽 아래에 위치한 디저트 아이템의 귀 색은 무엇입니까?
196,196,812,fine-grained perception,recognition,410,What is the main item on the counter in the im...,A: A cup of ice cream on a counter with a cone...,choice,사진 속 카운터에 있는 주요 아이템은 무엇입니까?
197,197,709,logical reasoning,common reasoning,814,Which corner doesn't have any food?\n,"A: top-right 오른쪽 위, B: top-left 왼쪽 위, C: botto...",choice,어떤 모서리에는 음식이 없나요?


In [61]:
# num_sample=0 skips sampling, so every row of mm is passed through the translation step.
mm = sample_new_qs(
    mm,
    num_sample=0,
    types=['category', 'l2_category'],
    answer_type='choice',
    preprocess=True,
    filename=None,
)

{
  "question_kr": "이미지 속 여성의 파란 가방은 어디에 위치해 있나요?",
  "options": "A: In her hand 그녀의 손에, B: On her shoulder 그녀의 어깨에, C: On the ground 땅에, D: Inside the man's bag 그 남자의 가방 안에"
}
{
  "question_kr": "이 이미지에 어떤 감정이 나타나고 있나요?",
  "options": "A: engaged 참여하는, B: lonely 외로운, C: angry 화난, D: supportive 지지하는"
}
{
  "question_kr": "거실의 러그는 어디에 위치해 있나요?",
  "options": "A: Next to the door 문 옆에, B: Under the table 탁자 아래에, C: In front of the window 창문 앞에, D: Under the couch 소파 아래에"
}
{
  "question_kr": "이미지에 곰인형이 있나요?",
  "options": "A: Can't tell 알 수 없음, B: No 아니오, C: Yes 네, D: Maybe 아마도"
}
{
  "question_kr": "사진 속 사람들 간의 관계는 무엇입니까?",
  "options": "A: commercial 상업적인, B: professional 전문적인, C: friends 친구들, D: family 가족"
}
{
  "question_kr": "사진에서 아기가 어느 방향을 향하고 있나요?",
  "options": "A: left 왼쪽, B: right 오른쪽, C: up 위, D: down 아래"
}
{
  "question_kr": "다음에 무슨 일이 일어날까요?",
  "options": "A: the kid is gonna slide through 그 아이가 미끄러져 지나갈 거야, B: the kid is gonna crash into the other kid 그 아이가 다른 아이와 부딪힐 거야,

In [19]:
# MMStar questions that were not chosen: load the 'others' sheet and sample 200 of them.
new_mmstar = pd.read_excel(mmstar, sheet_name='others')
sample_new_qs(new_mmstar, num_sample=200)

Unnamed: 0.1,Unnamed: 0,category,l2_category,question_id,question_en,question_kr,options,answer_type,question,answer,correct
0,0,instance reasoning,single-instance reasoning,664.0,Where is the woman's blue bag located in the i...,"이미지에서 여자의 파란 가방은 어디에 있나요? A: 그녀의 손에, B: 어깨에, C...","A: In her hand, B: On her shoulder, C: On the...",choice,,,
1,1,coarse perception,image emotion,6.0,What feeling is shown in this image?\n,"이 이미지에는 어떤 느낌이 담겨 있나요? A: 참여, B: 외로운, C: 화난, D...","A: engaged, B: lonely, C: angry, D: supportive",choice,,,
2,2,instance reasoning,single-instance reasoning,662.0,Where is the rug in the living room located?\n,"거실의 러그는 어디에 있나요?\nA: 문 옆, B: 테이블 아래, C: 창문 앞, ...","A: Next to the door, B: Under the table, C: I...",choice,,,
3,3,logical reasoning,common reasoning,773.0,Is there a teddy bear in the image?\n,"이미지에 곰 인형이 있나요?\n옵션: A: 알 수 없습니다, B: 아니요, C: 네...","A: Can't tell, B: No, C: Yes, D: Maybe",choice,,,
4,4,instance reasoning,cross-instance relation reasoning,556.0,What is the relationship between the people in...,"이미지 속 사람들 사이의 관계는 무엇인가요? A: 광고, B: 전문가, C: 친구,...","A: commercial, B: professional, C: friends, D...",choice,,,
...,...,...,...,...,...,...,...,...,...,...,...
194,844,coarse perception,image scene and topic,92.0,What is the overall theme of the image?\n,,"A: Beach vacation, B: Athletic lifestyle, C: ...",choice,What is the overall theme of the image?\n Opti...,C,0.153846
195,558,fine-grained perception,recognition,378.0,What is the color of the ears on the dessert i...,,"A: Red, B: Brown, C: Black, D: White",choice,What is the color of the ears on the dessert i...,D,0.250000
196,812,fine-grained perception,recognition,410.0,What is the main item on the counter in the im...,,A: A cup of ice cream on a counter with a con...,choice,What is the main item on the counter in the im...,B,0.166667
197,709,logical reasoning,common reasoning,814.0,Which corner doesn't have any food?\n,,"A: top-right, B: top-left, C: bottom-left, D:...",choice,Which corner doesn't have any food?\n Options:...,D,0.181818


In [97]:
# Stack the frames in `dfs` and drop every column that contains a missing value.
# NOTE(review): this cell expects `question_id` / `question` columns in `dfs` — confirm which cell builds that list.
df = pd.concat(dfs).dropna(axis=1)

# Non-numeric ids become NaN, then each id is mapped to its question type.
df['question_id'] = pd.to_numeric(df['question_id'], errors='coerce')
# df['question_id']=df['question_id'].astype('int')
df['question_type'] = df['question_id'].map(question_type)

# Text before the first newline is the English question, the remainder the Korean one.
df[['question_en', 'question_kr']] = df['question'].str.split(pat='\n', n=1, expand=True)
# df['question_id'] = df['question_id'].t(int)

# Keep complete rows only and mark them as free-text questions.
df = df.dropna(axis=0).assign(answer_type='text')

# NOTE(review): this writes to ./questions/, while another cell reads
# questions/processed/vqa_v1_text.csv — confirm the intended location.
df.drop(columns=['question']).reset_index().to_csv('./questions/vqa_v1_text.csv')

In [40]:
# Distinct question ids in df and how many there are.
qids = pd.unique(df['question_id'])
len(qids)

278

In [None]:
# Question types ordered by ascending number of rows (per the question_id count).
qtypes = (
    df.groupby(['question_type'])
    .count()
    .sort_values(by=['question_id'])
    .reset_index()
    .question_type.unique()
)
# Display the variable assigned above; the cell previously referenced `qtype`,
# a name this notebook never assigns.
qtypes

array(['is the person', 'has', 'was', 'how many people are',
       'what are the', 'what sport is', 'which', 'what is the man',
       'what kind of', 'is it', 'what is', 'what time', 'is the man',
       'can you', 'are the', 'do you', 'does this', 'do', 'is that a',
       'none of the above', 'what color', 'why', 'where is the', 'who is',
       'what color are the', 'what color is', 'what are', 'is he', 'is',
       'is there a', 'are they', 'are these', 'what is in the',
       'are there', 'does the', 'what is the', 'what type of', 'could',
       'are', 'how many', 'is there', 'are there any', 'what is on the',
       'what', 'what color is the', 'is this a', 'is this', 'is the'],
      dtype=object)