In [9]:
import os
import json

# Locations of the original (English) VQA-X annotation files.
# NOTE: despite the `_dir` suffix, train_dir / test_dir / val_dir are paths to
# JSON files; the names are kept because other cells may reference them.
datasets_dir = '../../../datasets'
vqax_dir = os.path.join(datasets_dir, 'VQA-X')
train_dir = os.path.join(vqax_dir, 'vqaX_train.json')
test_dir = os.path.join(vqax_dir, 'vqaX_test.json')
val_dir = os.path.join(vqax_dir, 'vqaX_val.json')

# Read every split with an explicit UTF-8 encoding so that loading does not
# depend on the platform's default locale encoding.
with open(train_dir, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
with open(test_dir, 'r', encoding='utf-8') as f:
    test_data = json.load(f)
with open(val_dir, 'r', encoding='utf-8') as f:
    val_data = json.load(f)

In [4]:
# Pretty-print a single test-split record to inspect the annotation schema.
sample_record = test_data['262284001']
print(json.dumps(sample_record, indent=2))

{
  "question": "What is this?",
  "answers": [
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 1
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 2
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 3
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 4
    },
    {
      "answer": "shower",
      "answer_confidence": "maybe",
      "answer_id": 5
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 6
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 7
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 8
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 9
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 10
    }
  ],
  "im

---

## Verifying the translations

In [6]:
import json
import os

# Path to the directory containing the data files
datasets_dir = '../../../datasets/VQA-X'

# List of dataset splits
datasets = ['train', 'val', 'test']

# List of translation sources
translation_sources = ['vinai', 'gemini', 'ggtrans']

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    return parsed

def _collect_item_errors(item, translation_sources):
    """Return the list of integrity problems found in one merged record."""
    item_errors = []

    # Original fields: .get() makes an absent key show up as a reported error
    # instead of aborting the whole check with a KeyError.
    original_explanations = item.get('explanation')
    if not item.get('question') or not item.get('answers') or not original_explanations:
        item_errors.append("Missing original fields")
    expected_count = len(original_explanations) if original_explanations else 0

    # Translated fields, one triple (question / answer / explanation) per source.
    for source in translation_sources:
        question_key = f'question_vi_{source}'
        answer_key = f'answer_vi_{source}'
        explanation_key = f'explanation_vi_{source}'

        if not item.get(question_key):
            item_errors.append(f"Missing or empty {question_key}")

        if not item.get(answer_key):
            item_errors.append(f"Missing or empty {answer_key}")

        if explanation_key not in item:
            item_errors.append(f"Missing {explanation_key}")
        elif not item[explanation_key]:
            item_errors.append(f"Empty {explanation_key}")
        elif len(item[explanation_key]) != expected_count:
            item_errors.append(f"Mismatch in number of explanations for {explanation_key}")
        elif any(not exp for exp in item[explanation_key]):
            item_errors.append(f"Empty explanation in {explanation_key}")

    return item_errors


def check_data_integrity(datasets_dir, datasets, translation_sources, errors_dir=None):
    """Validate the merged ``vqaX_<dataset>_translated.json`` files.

    For every dataset name the merged file is loaded and each record is checked
    for non-empty original fields (``question``, ``answers``, ``explanation``)
    and, per translation source, for non-empty ``question_vi_<source>``,
    ``answer_vi_<source>`` and ``explanation_vi_<source>`` entries, where the
    translated explanations must match the originals in number. The total item
    count and the number of items with errors are printed per dataset.

    Args:
        datasets_dir: Directory holding the merged JSON files.
        datasets: Iterable of dataset names (e.g. ``'train'``).
        translation_sources: Iterable of translation source names.
        errors_dir: Optional directory. When given, the per-item error lists of
            each dataset are written to ``<errors_dir>/<dataset>_errors.json``.
            With the default ``None`` nothing is written.

    Returns:
        None. Results are reported through ``print`` (and the optional files).
    """
    for dataset in datasets:
        print(f"Checking {dataset} dataset...")

        # Load merged data
        merged_file = f'{datasets_dir}/vqaX_{dataset}_translated.json'
        merged_data = load_json(merged_file)

        error_items = {}
        for key, item in merged_data.items():
            item_errors = _collect_item_errors(item, translation_sources)
            if item_errors:
                error_items[key] = item_errors

        # Save error items only on request, so the default call has no
        # file-system side effects.
        if errors_dir is not None:
            error_file = os.path.join(errors_dir, f'{dataset}_errors.json')
            with open(error_file, 'w', encoding='utf-8') as f:
                json.dump(error_items, f, ensure_ascii=False, indent=2)

        print(f"Total items: {len(merged_data)}")
        print(f"Items with errors: {len(error_items)}")
        print("\n")

# Run the integrity check for every dataset split and translation source listed in this cell
check_data_integrity(datasets_dir, datasets, translation_sources)

Checking train dataset...
Total items: 29459
Items with errors: 0


Checking val dataset...
Total items: 1459
Items with errors: 0


Checking test dataset...
Total items: 1968
Items with errors: 0




## Combine translations into one object

In [5]:
import json
import os

# Path to the directory containing the data files
datasets_dir = '../../../datasets/VQA-X'

# List of dataset splits
datasets = ['train', 'val', 'test']

# List of translation sources
translation_sources = ['vinai', 'gemini', 'ggtrans']

def load_json(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        parsed = json.load(json_file)
    return parsed

def merge_translations(datasets_dir, datasets, translation_sources):
    """Attach the available Vietnamese translations to the original records.

    For each dataset name, ``vqaX_<dataset>.json`` is loaded together with every
    existing ``vqaX_<dataset>_<source>.json`` file. Each original record is
    shallow-copied and, for every source that contains the record's key, gains
    ``question_vi_<source>``, ``answer_vi_<source>`` and
    ``explanation_vi_<source>`` (``None`` when the translated record lacks the
    field). The result is written to ``vqaX_<dataset>_translated.json`` and a
    few counts are printed.

    Args:
        datasets_dir: Directory holding the original and translation files.
        datasets: Iterable of dataset names.
        translation_sources: Iterable of translation source names.
    """
    for dataset in datasets:
        print(f"Processing {dataset} dataset...")

        # Original (untranslated) records of this split.
        original_data = load_json(f'{datasets_dir}/vqaX_{dataset}.json')

        # Translation files that exist on disk; a missing one only warns.
        available = {}
        for source in translation_sources:
            translation_file = f'{datasets_dir}/vqaX_{dataset}_{source}.json'
            if not os.path.exists(translation_file):
                print(f"Warning: {translation_file} not found. Skipping this translation source.")
                continue
            available[source] = load_json(translation_file)

        # Copy each record and add the translated fields source by source.
        merged_data = {}
        for record_id, record in original_data.items():
            enriched = record.copy()
            for source in translation_sources:
                if source not in available:
                    continue
                if record_id not in available[source]:
                    continue
                translated = available[source][record_id]
                enriched.update({
                    f'question_vi_{source}': translated.get(f'question_vi_{source}'),
                    f'answer_vi_{source}': translated.get(f'answer_vi_{source}'),
                    f'explanation_vi_{source}': translated.get(f'explanation_vi_{source}'),
                })
            merged_data[record_id] = enriched

        # Persist the merged split.
        output_file = f'{datasets_dir}/vqaX_{dataset}_translated.json'
        with open(output_file, 'w', encoding='utf-8') as out:
            json.dump(merged_data, out, ensure_ascii=False, indent=2)
        print(f"Merged data saved to {output_file}")

        # Summary counts.
        print(f"Total items in original data: {len(original_data)}")
        print(f"Total items in merged data: {len(merged_data)}")
        for source in translation_sources:
            if source not in available:
                continue
            print(f"Items with {source} translation: {sum(1 for item in merged_data.values() if f'question_vi_{source}' in item)}")
        print("\n")

# Run the merging process for every dataset split and translation source listed in this cell
merge_translations(datasets_dir, datasets, translation_sources)

Processing train dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_train_translated.json
Total items in original data: 29459
Total items in merged data: 29459
Items with vinai translation: 29459
Items with gemini translation: 29459
Items with ggtrans translation: 29459


Processing val dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_val_translated.json
Total items in original data: 1459
Total items in merged data: 1459
Items with vinai translation: 1459
Items with gemini translation: 1459
Items with ggtrans translation: 1459


Processing test dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_test_translated.json
Total items in original data: 1968
Total items in merged data: 1968
Items with vinai translation: 1968
Items with gemini translation: 1968
Items with ggtrans translation: 1968




In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Load the "vinai/vinai-translate-en2vi-v2" tokenizer (source language en_XX)
# and its sequence-to-sequence model.
tokenizer_en2vi = AutoTokenizer.from_pretrained("vinai/vinai-translate-en2vi-v2", src_lang="en_XX")
model_en2vi = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-en2vi-v2")
# Prefer the GPU, but fall back to the CPU so the cell does not crash on
# machines without CUDA.
device_en2vi = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_en2vi.to(device_en2vi)

def translate_en2vi(en_texts):
    """Translate a batch of English texts to Vietnamese.

    The texts are tokenized with padding and truncation (``max_length=512``),
    moved to ``device_en2vi`` and decoded with 5-beam search, using the
    tokenizer's ``vi_VN`` language code as decoder start token.

    Args:
        en_texts: Texts to translate; the calls in this notebook pass a list
            of strings.

    Returns:
        The result of ``tokenizer_en2vi.batch_decode`` on the generated ids,
        with special tokens skipped.
    """
    encoded_batch = tokenizer_en2vi(
        en_texts,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
    encoded_batch = encoded_batch.to(device_en2vi)

    generation_options = dict(
        decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
        num_return_sequences=1,
        num_beams=5,
        early_stopping=True,
    )
    generated_ids = model_en2vi.generate(**encoded_batch, **generation_options)

    return tokenizer_en2vi.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )



In [4]:
# Translate the bare answer and the same answer embedded in a
# question/answer sentence, printing each result for comparison.
for en_batch in (
    ["bat"],
    ["Question: What is the child holding? Answer: bat"],
):
    print(translate_en2vi(en_batch))

['dơi']
['Câu hỏi: Đứa trẻ đang cầm gì? Trả lời: dơi']


## Randomly sample 500 training items for manual translation

In [9]:
import json
import pandas as pd
import random

# Path to the train-split data file
train_file = '../../../datasets/VQA-X/vqaX_train.json'

# Fixed seed and sample size so that re-running the cell selects the same items.
SAMPLE_SEED = 42
SAMPLE_SIZE = 500

with open(train_file, 'r', encoding='utf-8') as f:
    train_data = json.load(f)

# A dedicated Random instance makes the draw reproducible without touching the
# module-level random state; the size is capped so that a dataset with fewer
# than SAMPLE_SIZE items does not raise ValueError.
sample_keys = random.Random(SAMPLE_SEED).sample(
    list(train_data.keys()),
    min(SAMPLE_SIZE, len(train_data)),
)

# Build the list of samples (one row per sampled item)
samples = []
for key in sample_keys:
    item = train_data[key]
    samples.append({
        'id': key,
        'question': item['question'],
        'question_vi': '',  # Column to be filled in with the translation
        'explanation': item['explanation'][0],  # only the first explanation is exported
        'explanations_vi': ''  # Column to be filled in with the translation
    })

df = pd.DataFrame(samples)

output_file = 'sample_500_train.csv'
# utf-8-sig writes a BOM at the start of the file.
df.to_csv(output_file, index=False, encoding='utf-8-sig')

# Report the number of rows actually written (500 unless the dataset is smaller).
print(f"Đã lưu {len(samples)} mẫu vào file {output_file}")

Đã lưu 500 mẫu vào file sample_500_train.csv


In [2]:
import json
import pandas as pd

# Paths to the data files
datasets_dir = '../../../datasets/VQA-X'
dataset_files = ['vqaX_train_translated.json', 'vqaX_val_translated.json', 'vqaX_test_translated.json']

def get_answers_from_file(file_path):
    """Collect the ``answer_vi_gemini`` value of every item in one JSON file.

    Args:
        file_path: Path to a UTF-8 JSON file mapping item ids to records that
            each contain an ``answer_vi_gemini`` entry.

    Returns:
        A list of ``(item id, answer_vi_gemini)`` tuples in the file's key order.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    return [(item_id, record['answer_vi_gemini']) for item_id, record in records.items()]

# Gather the (id, answer) pairs from all of the data files
all_answers = []
for file in dataset_files:
    file_path = f"{datasets_dir}/{file}"
    all_answers.extend(get_answers_from_file(file_path))

# Sort by answer text so that identical answers end up on adjacent rows.
sorted_answers = sorted(all_answers, key=lambda x: x[1])
df = pd.DataFrame(sorted_answers, columns=['id', 'answer_vi_gemini'])
output_file = 'vqax_answer_gemini.csv'
df.to_csv(output_file, index=False, encoding='utf-8-sig')

# No de-duplication is performed (one row per item), so the message reports the
# number of answers saved instead of claiming that they are unique.
print(f"Đã lưu {len(sorted_answers)} câu trả lời vào file {output_file}")

Đã lưu 32886 câu trả lời duy nhất vào file vqax_answer_gemini.csv
