In [9]:
import os
import json

# Set paths
datasets_dir = '../../../datasets'
vqax_dir = os.path.join(datasets_dir, 'VQA-X')
# NOTE: despite the `_dir` suffix, the three names below hold paths to JSON
# *files*, not directories. The names are kept unchanged in case other cells
# reference them.
train_dir = os.path.join(vqax_dir, 'vqaX_train.json')
test_dir = os.path.join(vqax_dir, 'vqaX_test.json')
val_dir = os.path.join(vqax_dir, 'vqaX_val.json')

# Load every split into memory. The encoding is passed explicitly so the
# result does not depend on the platform's default text encoding (and to
# match the other cells, which already open their JSON files as UTF-8).
with open(train_dir, encoding='utf-8') as f:
    train_data = json.load(f)
with open(test_dir, encoding='utf-8') as f:
    test_data = json.load(f)
with open(val_dir, encoding='utf-8') as f:
    val_data = json.load(f)

In [4]:
# Pretty-print a single test record to inspect the structure of one entry.
sample_id = '262284001'
print(json.dumps(test_data[sample_id], indent=2))

{
  "question": "What is this?",
  "answers": [
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 1
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 2
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 3
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 4
    },
    {
      "answer": "shower",
      "answer_confidence": "maybe",
      "answer_id": 5
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 6
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 7
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 8
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 9
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 10
    }
  ],
  "im

---

In [1]:
import json
import os
from collections import Counter

data_dir = '../../../datasets/VQA-X'

# List of files to check: one Google-Translate output file per split
files_to_check = [
    f'vqaX_{split}_ggtrans.json'
    for split in ('train', 'test', 'val')
]
def analyze_translations(file_path):
    """Count missing ``*_vi_ggtrans`` fields in a translated JSON file.

    The file must contain a JSON object mapping an item key to a dict of
    fields. For every item the ``question_vi_ggtrans``, ``answer_vi_ggtrans``
    and ``explanation_vi_ggtrans`` fields are inspected.

    A question/answer counts as empty when it is missing, ``None`` (JSON
    ``null``) or a string made only of whitespace. The explanation counts as
    empty when it is missing, falsy (``None``, ``[]``, ``''``), or when any
    of its entries is empty by the same rule. A bare string explanation is
    treated as a single entry instead of being searched for the substring
    ``''`` (which every string contains).

    Args:
        file_path: Path to the UTF-8 encoded JSON file to inspect.

    Returns:
        A dict with three keys:
        ``total_items`` - number of items in the file;
        ``empty_translations`` - dict mapping ``'question'`` / ``'answer'`` /
        ``'explanation'`` to the number of items where that field is empty
        (fields with a zero count are omitted);
        ``error_count`` - number of items with at least one empty field.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    def _is_blank(value):
        # None (JSON null) and whitespace-only strings carry no translation.
        return value is None or (isinstance(value, str) and not value.strip())

    def _explanation_missing(value):
        if not value:
            return True
        if isinstance(value, str):
            # `'' in some_string` is always True, so check the text itself.
            return _is_blank(value)
        return any(_is_blank(entry) for entry in value)

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    empty_translations = Counter()
    error_count = 0

    for item in data.values():
        # Evaluate each field once and reuse the flags for both tallies.
        missing = {
            'question': _is_blank(item.get('question_vi_ggtrans')),
            'answer': _is_blank(item.get('answer_vi_ggtrans')),
            'explanation': _explanation_missing(item.get('explanation_vi_ggtrans')),
        }
        for field, is_missing in missing.items():
            if is_missing:
                empty_translations[field] += 1

        # Check whether any translated field is empty
        if any(missing.values()):
            error_count += 1

    return {
        'total_items': len(data),
        'empty_translations': dict(empty_translations),
        'error_count': error_count
    }

# Analyze each file
for file_name in files_to_check:
    file_path = os.path.join(data_dir, file_name)

    # Report missing files up front and move on to the next one.
    if not os.path.exists(file_path):
        print(f"File not found: {file_name}")
        continue

    print(f"Analyzing {file_name}...")
    results = analyze_translations(file_path)

    print(f"Total items: {results['total_items']}")
    print("Empty translations:")
    for field, count in results['empty_translations'].items():
        print(f"  {field}: {count}")
    print(f"Total items with at least one empty translation: {results['error_count']}")
    # Left disabled; as written it would raise ZeroDivisionError for a file with no items.
    # print(f"Percentage of items with errors: {results['error_count']/results['total_items']*100:.2f}%")
    print("\n")

Analyzing vqaX_train_ggtrans.json...
Total items: 29459
Empty translations:
Total items with at least one empty translation: 0


Analyzing vqaX_test_ggtrans.json...
Total items: 1968
Empty translations:
Total items with at least one empty translation: 0


Analyzing vqaX_val_ggtrans.json...
Total items: 1459
Empty translations:
Total items with at least one empty translation: 0




In [2]:
import json
import os

# Path to the directory containing the data files
datasets_dir = '../../../datasets/VQA-X'

# List of datasets (used to build the per-split file names)
datasets = ['train', 'val', 'test']

# List of translation sources (used as the file-name and field-name suffix)
translation_sources = ['vinai', 'gemini', 'ggtrans']

def load_json(file_path):
    """Read the UTF-8 encoded JSON file at ``file_path`` and return its parsed content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def merge_translations(datasets_dir, datasets, translation_sources):
    """Merge per-source translation files into one JSON file per dataset.

    For each name in ``datasets`` this:

    1. loads ``{datasets_dir}/vqaX_{dataset}.json``;
    2. loads ``{datasets_dir}/vqaX_{dataset}_{source}.json`` for every source
       whose file exists, printing a warning for the ones that do not;
    3. shallow-copies every original item and, for each loaded source that
       contains the item's key, sets ``question_vi_{source}``,
       ``answer_vi_{source}`` and ``explanation_vi_{source}`` from the
       translated item (``None`` when the translated item lacks the field);
    4. writes the result to ``{datasets_dir}/vqaX_{dataset}_translated.json``;
    5. prints item counts for the original data, the merged data, and each
       loaded source.

    Args:
        datasets_dir: Directory holding the input files; also receives the output.
        datasets: Iterable of dataset names used in the file names.
        translation_sources: Iterable of source names used as file-name and
            field-name suffixes.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    field_prefixes = ('question', 'answer', 'explanation')

    for dataset in datasets:
        print(f"Processing {dataset} dataset...")

        # Untranslated records for this dataset.
        base_records = load_json(f'{datasets_dir}/vqaX_{dataset}.json')

        # Keep only the translation sources whose file is present on disk.
        loaded_sources = {}
        for source in translation_sources:
            source_path = f'{datasets_dir}/vqaX_{dataset}_{source}.json'
            if not os.path.exists(source_path):
                print(f"Warning: {source_path} not found. Skipping this translation source.")
                continue
            loaded_sources[source] = load_json(source_path)

        # Attach the translated fields of every available source to a copy of each record.
        combined = {}
        for record_id, record in base_records.items():
            enriched = record.copy()
            for source in translation_sources:
                if source not in loaded_sources or record_id not in loaded_sources[source]:
                    continue
                translated = loaded_sources[source][record_id]
                for prefix in field_prefixes:
                    field_name = f'{prefix}_vi_{source}'
                    enriched[field_name] = translated.get(field_name)
            combined[record_id] = enriched

        # Persist the merged records.
        destination = f'{datasets_dir}/vqaX_{dataset}_translated.json'
        with open(destination, 'w', encoding='utf-8') as out_file:
            json.dump(combined, out_file, ensure_ascii=False, indent=2)
        print(f"Merged data saved to {destination}")

        # Summary statistics; a source is counted for a record when its question key is present.
        print(f"Total items in original data: {len(base_records)}")
        print(f"Total items in merged data: {len(combined)}")
        for source in translation_sources:
            if source in loaded_sources:
                marker = f'question_vi_{source}'
                hits = sum(marker in record for record in combined.values())
                print(f"Items with {source} translation: {hits}")
        print("\n")

# Merge all translation sources for every dataset and write the combined files
merge_translations(
    datasets_dir=datasets_dir,
    datasets=datasets,
    translation_sources=translation_sources,
)

Processing train dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_train_translated.json
Total items in original data: 29459
Total items in merged data: 29459
Items with vinai translation: 29459
Items with gemini translation: 29459
Items with ggtrans translation: 29459


Processing val dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_val_translated.json
Total items in original data: 1459
Total items in merged data: 1459
Items with vinai translation: 1459
Items with gemini translation: 1459
Items with ggtrans translation: 1459


Processing test dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_test_translated.json
Total items in original data: 1968
Total items in merged data: 1968
Items with vinai translation: 1968
Items with gemini translation: 1968
Items with ggtrans translation: 1968


