In [1]:
import os
import json
import sys

# Locations of the VQA-X annotation files, relative to this notebook.
datasets_dir = '../../../datasets'
vqax_dir = os.path.join(datasets_dir, 'VQA-X')

# One JSON annotation file per split (the *_dir names point at files).
train_dir, test_dir, val_dir = (
    os.path.join(vqax_dir, annotation_file)
    for annotation_file in ('vqaX_train.json', 'vqaX_test.json', 'vqaX_val.json')
)

In [2]:
def _load_json(path):
    """Parse and return the JSON document stored at `path`.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default locale encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Load the three annotation splits.
train_data = _load_json(train_dir)
test_data = _load_json(test_dir)
val_data = _load_json(val_dir)

In [3]:
# Show which Python container type the parsed training annotations have.
print(type(train_data))

<class 'dict'>


In [5]:
# Pretty-print a single test entry to inspect its fields.
sample_test_entry = test_data['262284001']
print(json.dumps(sample_test_entry, indent=2))

{
  "question": "What is this?",
  "answers": [
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 1
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 2
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 3
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 4
    },
    {
      "answer": "shower",
      "answer_confidence": "maybe",
      "answer_id": 5
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 6
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 7
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 8
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 9
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 10
    }
  ],
  "im

In [8]:
# Pretty-print a single training entry to inspect its fields.
sample_train_entry = train_data['262146001']
print(json.dumps(sample_train_entry, indent=2))

{
  "question": "What is the person doing?",
  "answers": [
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 1
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 2
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 3
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 4
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 5
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 6
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 7
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 8
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 9
    },
    {
      "answer": "skiing",
      "answer_confidence": "yes",
      "answer_id": 10
    }


In [6]:
# Explanation counts per Q/A pair.
# For test and val: report how many pairs do NOT carry exactly 3 explanations.
# For train: print the full distribution of explanations per pair.
# Nothing is asserted here; the numbers are only printed for inspection.

notthree = sum(1 for v in test_data.values() if len(v['explanation']) != 3)
print(f"Test data with not 3 explanations: {notthree}/{len(test_data)}")

notthree = sum(1 for v in val_data.values() if len(v['explanation']) != 3)
print(f"Val data with not 3 explanations: {notthree}/{len(val_data)}")

number_of_explanations_train = [len(v['explanation']) for v in train_data.values()]
# sorted() pins the print order instead of relying on set iteration order.
for i in sorted(set(number_of_explanations_train)):
    print(f"Number of explanations in train data: {i}: {number_of_explanations_train.count(i)}")

Test data with not 3 explanations: 0/1968
Val data with not 3 explanations: 0/1459
Number of explanations in train data: 1: 27383
Number of explanations in train data: 2: 2075
Number of explanations in train data: 3: 1


In [40]:
# Print every recorded answer (with its confidence) for the first 10 training
# entries, to eyeball whether the answers to one question agree.
first_entry_ids = list(train_data)[:10]
for entry_id in first_entry_ids:
    entry = train_data[entry_id]
    for annotation in entry['answers']:
        print(f"{entry['question']}: {annotation['answer']} - {annotation['answer_confidence']}")


What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
What is the person doing?: skiing - yes
Is the window open?: yes - yes
Is the window open?: no - yes
Is the window open?: yes - yes
Is the window open?: no - maybe
Is the window open?: yes - yes
Is the window open?: yes - maybe
Is the window open?: yes - yes
Is the window open?: yes - maybe
Is the window open?: yes - yes
Is the window open?: yes - yes
What is this person doing?: skiing - yes
What is this person doing?: skiing - yes
What is this person doing?: skiing - yes
What is this person doing?: skiing - yes
What is this person doing?: snow ski - yes
What is this person doing?: skiing - yes
What is this person doing?: skiing - y

In [39]:
import json
import pandas as pd
from collections import Counter

def calculate_stats(data):
    """Compute summary statistics for one dataset split.

    Args:
        data: Mapping from an entry key to an entry dict. Each entry must
            provide 'image_id', 'question', 'answers' (a list of dicts with
            an 'answer' key) and 'explanation' (a list of strings).

    Returns:
        dict with the table columns:
            'Imgs'            - number of distinct image ids
            'Q/A Pairs'       - number of entries in `data`
            'Unique Q.'       - number of distinct question strings
            'Unique A.'       - number of distinct answer strings
            'Expl.'           - total number of explanations
            'Avg. #w'         - mean whitespace-separated words per explanation
                                (0.0 when the split has no explanations)
            'Expl.Vocab Size' - number of distinct explanation words
            'Comple. Pairs', 'Visual Ann.' - '-' placeholders (not available)
    """
    image_ids = set()
    unique_questions = set()
    unique_answers = set()
    vocabulary = set()
    explanation_count = 0
    word_count = 0

    for item in data.values():
        image_ids.add(item['image_id'])
        unique_questions.add(item['question'])
        unique_answers.update(a['answer'] for a in item['answers'])
        for explanation in item['explanation']:
            # Split once and reuse the tokens for both the length and the vocabulary.
            words = explanation.split()
            explanation_count += 1
            word_count += len(words)
            vocabulary.update(words)

    return {
        'Imgs': len(image_ids),
        'Q/A Pairs': len(data),
        'Unique Q.': len(unique_questions),
        'Unique A.': len(unique_answers),
        'Expl.': explanation_count,
        # Guard the mean so an empty split yields 0.0 instead of ZeroDivisionError.
        'Avg. #w': word_count / explanation_count if explanation_count else 0.0,
        'Expl.Vocab Size': len(vocabulary),
        'Comple. Pairs': '-',  # We don't have this information
        'Visual Ann.': '-'  # We don't have this information
    }

# Run the same summary over every split, in train / val / test order.
train_stats, val_stats, test_stats = map(
    calculate_stats, (train_data, val_data, test_data)
)

# Calculate total
# Additive columns (Q/A pairs, explanations) are summed over the splits.
# "Distinct" columns (images, questions, answers, vocabulary) are taken over
# the union of all splits, so a value that occurs in more than one split is
# counted once. Summing the per-split image counts would double-count any
# image shared between splits.
_split_stats = [train_stats, val_stats, test_stats]
_all_items = [item
              for split in (train_data, val_data, test_data)
              for item in split.values()]
_total_explanations = sum(s['Expl.'] for s in _split_stats)

total_stats = {
    'Imgs': len({item['image_id'] for item in _all_items}),
    'Q/A Pairs': sum(s['Q/A Pairs'] for s in _split_stats),
    'Unique Q.': len({item['question'] for item in _all_items}),
    'Unique A.': len({a['answer'] for item in _all_items for a in item['answers']}),
    'Expl.': _total_explanations,
    # Explanation-weighted mean of the per-split averages; 0.0 when there are
    # no explanations at all, to avoid dividing by zero.
    'Avg. #w': (sum(s['Avg. #w'] * s['Expl.'] for s in _split_stats) / _total_explanations
                if _total_explanations else 0.0),
    'Expl.Vocab Size': len({word
                            for item in _all_items
                            for exp in item['explanation']
                            for word in exp.split()}),
    'Comple. Pairs': '-',
    'Visual Ann.': '-'
}

# Assemble one table row per split plus the overall total.
stats_rows = [train_stats, val_stats, test_stats, total_stats]
df = pd.DataFrame(stats_rows, index=['Train', 'Val', 'Test', 'Total'])

# Presentation: two-decimal average, integer dtype for every count column.
count_columns = ['Imgs', 'Q/A Pairs', 'Unique Q.', 'Unique A.', 'Expl.', 'Expl.Vocab Size']
df = (
    df
    .assign(**{'Avg. #w': df['Avg. #w'].round(2)})
    .astype({column: int for column in count_columns})
)

df

Unnamed: 0,Imgs,Q/A Pairs,Unique Q.,Unique A.,Expl.,Avg. #w,Expl.Vocab Size,Comple. Pairs,Visual Ann.
Train,24876,29459,12999,4298,31536,8.55,9360,-,-
Val,1431,1459,814,681,4377,8.89,3442,-,-
Test,1921,1968,902,784,5904,8.94,3819,-,-
Total,28228,32886,13987,4772,41817,8.64,10537,-,-


In [24]:
# Number of entries in each split, printed in train / val / test order.
for split_data in (train_data, val_data, test_data):
    print(len(split_data))

29459
1459
1968
