In [1]:
import os
import json

# Set paths
datasets_dir = '../../../datasets'
vqax_dir = os.path.join(datasets_dir, 'VQA-X')
train_dir = os.path.join(vqax_dir, 'vqaX_train.json')
test_dir = os.path.join(vqax_dir, 'vqaX_test.json')
val_dir = os.path.join(vqax_dir, 'vqaX_val.json')
with open(train_dir) as f:
    train_data = json.load(f)
with open(test_dir) as f:
    test_data = json.load(f)
with open(val_dir) as f:
    val_data = json.load(f)

In [4]:
print(json.dumps(test_data['262284001'],indent=2))

{
  "question": "What is this?",
  "answers": [
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 1
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 2
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 3
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 4
    },
    {
      "answer": "shower",
      "answer_confidence": "maybe",
      "answer_id": 5
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 6
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 7
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 8
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 9
    },
    {
      "answer": "shower",
      "answer_confidence": "yes",
      "answer_id": 10
    }
  ],
  "im

## Combine translations into one object

## Verifying the translations

In [12]:
import json
import os

# Đường dẫn đến thư mục chứa các file dữ liệu
datasets_dir = '../../../datasets/VQA-X'

# Danh sách các tập dữ liệu
datasets = ['train', 'val', 'test']

# Danh sách các nguồn dịch
translation_sources = ['vinai', 'gemini', 'ggtrans', 'gpt']

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def check_data_integrity(datasets_dir, datasets, translation_sources):
    for dataset in datasets:
        print(f"Checking {dataset} dataset...")
        
        # Load merged data
        merged_file = f'{datasets_dir}/vqaX_{dataset}_translated.json'
        merged_data = load_json(merged_file)
        
        error_items = {}
        
        for key, item in merged_data.items():
            item_errors = []
            
            # Check original fields
            if not item['question'] or not item['answers'] or not item['explanation']:
                item_errors.append("Missing original fields")
            
            # Check translated fields
            for source in translation_sources:
                question_key = f'question_vi_{source}'
                answer_key = f'answer_vi_{source}'
                explanation_key = f'explanation_vi_{source}'
                
                if question_key not in item or not item[question_key]:
                    item_errors.append(f"Missing or empty {question_key}")
                
                if answer_key not in item or not item[answer_key]:
                    item_errors.append(f"Missing or empty {answer_key}")
                
                if explanation_key not in item:
                    item_errors.append(f"Missing {explanation_key}")
                elif not item[explanation_key]:
                    item_errors.append(f"Empty {explanation_key}")
                elif len(item[explanation_key]) != len(item['explanation']):
                    item_errors.append(f"Mismatch in number of explanations for {explanation_key}")
                elif any(not exp for exp in item[explanation_key]):
                    item_errors.append(f"Empty explanation in {explanation_key}")
            
            if item_errors:
                error_items[key] = item_errors
        
        # Save error items
        error_file = f'{dataset}_errors.json'
        with open(error_file, 'w', encoding='utf-8') as f:
            json.dump(error_items, f, ensure_ascii=False, indent=2)
        
        print(f"Total items: {len(merged_data)}")
        print(f"Items with errors: {len(error_items)}")
        print("\n")

# Run the integrity check
check_data_integrity(datasets_dir, datasets, translation_sources)

Checking train dataset...
Total items: 29459
Items with errors: 0


Checking val dataset...
Total items: 1459
Items with errors: 0


Checking test dataset...
Total items: 1968
Items with errors: 0




In [11]:
import json
import os

# Đường dẫn đến thư mục chứa các file dữ liệu
datasets_dir = '../../../datasets/VQA-X'

# Danh sách các tập dữ liệu
datasets = ['train', 'val', 'test']

# Danh sách các nguồn dịch
translation_sources = ['vinai', 'gemini', 'ggtrans', 'gpt']

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

def merge_translations(datasets_dir, datasets, translation_sources):
    for dataset in datasets:
        print(f"Processing {dataset} dataset...")
        
        # Load original data
        original_file = f'{datasets_dir}/vqaX_{dataset}.json'
        original_data = load_json(original_file)
        
        # Load translations
        translations = {}
        for source in translation_sources:
            translation_file = f'{datasets_dir}/vqaX_{dataset}_{source}.json'
            if os.path.exists(translation_file):
                translations[source] = load_json(translation_file)
            else:
                print(f"Warning: {translation_file} not found. Skipping this translation source.")
        
        # Merge translations
        merged_data = {}
        for key, item in original_data.items():
            merged_item = item.copy()
            for source in translation_sources:
                if source in translations and key in translations[source]:
                    translated_item = translations[source][key]
                    merged_item[f'question_vi_{source}'] = translated_item.get(f'question_vi_{source}')
                    merged_item[f'answer_vi_{source}'] = translated_item.get(f'answer_vi_{source}')
                    merged_item[f'explanation_vi_{source}'] = translated_item.get(f'explanation_vi_{source}')
            merged_data[key] = merged_item
        
        # Save merged data
        output_file = f'{datasets_dir}/vqaX_{dataset}_translated.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(merged_data, f, ensure_ascii=False, indent=2)
        print(f"Merged data saved to {output_file}")
        
        # Print some statistics
        print(f"Total items in original data: {len(original_data)}")
        print(f"Total items in merged data: {len(merged_data)}")
        for source in translation_sources:
            if source in translations:
                print(f"Items with {source} translation: {sum(1 for item in merged_data.values() if f'question_vi_{source}' in item)}")
        print("\n")

# Run the merging process
merge_translations(datasets_dir, datasets, translation_sources)

Processing train dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_train_translated.json
Total items in original data: 29459
Total items in merged data: 29459
Items with vinai translation: 29459
Items with gemini translation: 29459
Items with ggtrans translation: 29459
Items with gpt translation: 29459


Processing val dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_val_translated.json
Total items in original data: 1459
Total items in merged data: 1459
Items with vinai translation: 1459
Items with gemini translation: 1459
Items with ggtrans translation: 1459
Items with gpt translation: 1459


Processing test dataset...
Merged data saved to ../../../datasets/VQA-X/vqaX_test_translated.json
Total items in original data: 1968
Total items in merged data: 1968
Items with vinai translation: 1968
Items with gemini translation: 1968
Items with ggtrans translation: 1968
Items with gpt translation: 1968




## Visualize translations

In [15]:
import os
import json
import pandas as pd
import numpy as np
import random
random.seed(0)
# Đường dẫn tới thư mục datasets và evaluation
datasets_dir = '../../../datasets/VQA-X'
with open(os.path.join(datasets_dir, 'vqaX_val_translated.json')) as f:
    val_data = json.load(f)

vote_dir = os.path.join(datasets_dir, 'data_csv_final_voting')
argmax_data = pd.read_csv(os.path.join(vote_dir, 'argmax.csv'))
sampling_data = pd.read_csv(os.path.join(vote_dir, 'sampling.csv'))
# id_image,question,answer,explanation

argmax_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id_image     1459 non-null   int64 
 1   question     1459 non-null   object
 2   answer       1459 non-null   object
 3   explanation  1459 non-null   object
dtypes: int64(1), object(3)
memory usage: 45.7+ KB


In [19]:
# random id_image in train_data
indices = random.sample(list(val_data.keys()), 100)
translation_sources = ['vinai', 'gemini', 'ggtrans', 'gpt']
# print original, argmax, sampling and translation
for idx in indices:
    print(f"ID: {idx}")
    print(f"explanation: {val_data[idx]['explanation']}")
    print(f"Argmax: {argmax_data[argmax_data['id_image'] == int(idx)]['explanation'].values[0]}")
    print(f"Sampling: {sampling_data[sampling_data['id_image'] == int(idx)]['explanation'].values[0]}")
    print(f"Translation: ")
    for source in translation_sources:
        print(f"  {source}: {val_data[idx][f'explanation_vi_{source}']}")
    print("=====================================")

ID: 262651002
explanation: ['this boat is a large barge', 'it is designed to move cargo', 'it is a large boat for cargo']
Argmax: chiếc thuyền này là một chiếc sà lan lớn; nó được thiết kế để di chuyển hàng hóa; nó là một chiếc thuyền lớn để chở hàng
Sampling: Thuyền này là một sà lan lớn.; nó được thiết kế để di chuyển hàng hóa; nó là một chiếc thuyền lớn để chở hàng
Translation: 
  vinai: ['Thuyền này là một sà lan lớn.', 'nó được thiết kế để di chuyển hàng hóa', 'nó là một chiếc thuyền lớn để chở hàng']
  gemini: ['chiếc thuyền này là một chiếc sà lan lớn', 'nó được thiết kế để di chuyển hàng hóa', 'đó là một chiếc thuyền lớn để chở hàng hóa']
  ggtrans: ['chiếc thuyền này là một chiếc sà lan lớn', 'nó được thiết kế để di chuyển hàng hóa', 'nó là một chiếc thuyền lớn để chở hàng']
  gpt: ['chiếc thuyền này là một chiếc xà lan lớn', 'nó được thiết kế để vận chuyển hàng hóa', 'nó là một chiếc thuyền lớn dành cho hàng hóa']
ID: 558673000
explanation: ['there is a lot of furniture in th

## Check the score of the translations

In [10]:
import os
import json
datasets_dir = '../../../datasets/VQA-X'
evaluation_dir = os.path.join(datasets_dir, 'evaluation')
train_file_path = os.path.join(evaluation_dir, 'train_gpt_evaluation.json')

# Hàm để sửa các object bị lỗi
def fix_explanation_scores(obj):
    explanation = obj['explanation']
    explanation_scores = obj["explanation_scores"]

    # Đảm bảo rằng explanation_scores có số lượng phần tử tương ứng với các phần tử trong explanation
    if len(explanation) == 1:
        if len(explanation_scores) < 4 and obj["explanation_scores"][0] != []:
            obj["explanation_scores"] = [obj["explanation_scores"][0]]
        elif len(explanation_scores) == 4:
            # print(obj["id_image"], obj["explanation_scores"])
            obj["explanation_scores"] = [[explan[0] for explan in obj["explanation_scores"]]]
    return obj

# Gọi hàm để sửa file
with open(train_file_path, 'r', encoding='utf-8') as f:
    train_data = json.load(f)
    for obj in train_data:
        fix_explanation_scores(obj)
        
with open(train_file_path, 'w', encoding='utf-8') as f:
    json.dump(train_data, f, ensure_ascii=False, indent=2)
    
print("Fixed explanation scores in train data")

Fixed explanation scores in train data


In [14]:
import os
import json

# Đường dẫn tới thư mục datasets và evaluation
datasets_dir = '../../../datasets/VQA-X'
evaluation_dir = os.path.join(datasets_dir, 'evaluation')
llm_models = ['llama', 'qwen', 'phi','gemma','gpt']
evaluation_files = ['val', 'test', 'train']

# Hàm kiểm tra các trường score
def check_scores_length(data):
    errors = []
    for entry in data:
        if len(entry['question_scores']) != len(entry['question']):
            errors.append(f"Mismatch in question_scores: {entry['id_image']}")
        if len(entry['answer_scores']) != len(entry['answer']):
            errors.append(f"Mismatch in answer_scores: {entry['id_image']}")
        for i, expl_scores in enumerate(entry['explanation_scores']):
            try:
                if len(expl_scores) != len(entry['explanation'][i]):
                    errors.append(f"Mismatch in explanation_scores[{i}]: {entry['id_image']}")
            except:
                errors.append(f"Error in explanation_scores[{i}]: {entry['id_image']}")

    return errors

# Kiểm tra các file JSON của từng LLM
for model in llm_models:
    for eval_file in evaluation_files:
        file_path = os.path.join(evaluation_dir, f"{eval_file}_{model}_evaluation.json")
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
                errors = check_scores_length(data)
                if errors:
                    print(f"Errors in {file_path}:")
                    for error in errors:
                        print(f"  - {error}")
                else:
                    print(f"No errors found in {eval_file}_{model}_evaluation.json")
        else:
            print(f"File {file_path} does not exist.")

No errors found in val_llama_evaluation.json
No errors found in test_llama_evaluation.json
No errors found in train_llama_evaluation.json
No errors found in val_qwen_evaluation.json
No errors found in test_qwen_evaluation.json
No errors found in train_qwen_evaluation.json
No errors found in val_phi_evaluation.json
No errors found in test_phi_evaluation.json
No errors found in train_phi_evaluation.json
No errors found in val_gemma_evaluation.json
No errors found in test_gemma_evaluation.json
No errors found in train_gemma_evaluation.json
No errors found in val_gpt_evaluation.json
No errors found in test_gpt_evaluation.json
No errors found in train_gpt_evaluation.json


## Merge the scores

In [3]:
import json
import os

datasets_dir = '../../../datasets/VQA-X'
evaluation_dir = os.path.join(datasets_dir, 'evaluation')
llm_models = ['llama', 'qwen', 'phi', 'gemma', 'gpt']
evaluation_files = ['train', 'val', 'test']

def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def merge_evaluation_data(**json_files):
    merged_data = {}
    
    for model_name, file_path in json_files.items():
        data = load_json(file_path)
        
        for entry in data:
            id_image = entry["id_image"]
            if id_image not in merged_data:
                merged_data[id_image] = {
                    "id_image": id_image,
                    "question": entry["question"],
                    "question_scores": {},
                    "answer": entry["answer"],
                    "answer_scores": {},
                    "explanation": entry["explanation"],
                    "explanation_scores": {model_name: entry["explanation_scores"]}
                }
            
            # Add scores with the model name as the key
            merged_data[id_image]["question_scores"][model_name] = entry["question_scores"]
            merged_data[id_image]["answer_scores"][model_name] = entry["answer_scores"]
        
            merged_data[id_image]["explanation_scores"][model_name] = entry["explanation_scores"]

    return list(merged_data.values())

# Process each dataset (train, val, test)
for evaluation_file in evaluation_files:
    # Define paths to the JSON files for each model and the current evaluation file
    json_files = {
        model: os.path.join(evaluation_dir, f"{evaluation_file}_{model}_evaluation.json")
        for model in llm_models
    }
    
    # Merge the data for the current evaluation file
    merged_data = merge_evaluation_data(**json_files)
    
    # Save the merged data
    output_path = os.path.join(evaluation_dir, f'vqaX_{evaluation_file}_evaluation.json')
    with open(output_path, 'w', encoding='utf-8') as outfile:
        json.dump(merged_data, outfile, indent=2, ensure_ascii=False)
    
    print(f"Merged data for {evaluation_file} saved to {output_path}")


Merged data for train saved to ../../../datasets/VQA-X/evaluation/vqaX_train_evaluation.json
Merged data for val saved to ../../../datasets/VQA-X/evaluation/vqaX_val_evaluation.json
Merged data for test saved to ../../../datasets/VQA-X/evaluation/vqaX_test_evaluation.json


## Test gpt

In [13]:
import openai
import json
from dotenv import load_dotenv
import os
load_dotenv()
# Đặt API key của bạn
openai.api_key = os.getenv("OPENAI_API_KEY")
prompt = """
You will be given an English question, answer, and explanations for context. Then, you will evaluate Vietnamese translations of the question, answer, and explanation(s). Evaluate each translation based on accuracy, fluency, and cultural appropriateness, considering the full context provided. Assign a score between 0 and 100 for each translation.

**Return the scores in the following JSON format and no additional text or explanations:**

{{
    "question_scores": [score_for_translation_1, score_for_translation_2, ...],
    "answer_scores": [score_for_translation_1, score_for_translation_2, ...],
    "explanation_scores": [
        [score for translations of explanation 1],
        [score for translations of explanation 2], (if multiple explanations are provided)
        ...
    ] (length of this list should match the number of explanations provided)
}}

Now, please evaluate the following:
"""
content_user = """
English Question: How many legs does a cat have?
Translation 1: Mèo có bao nhiêu chân?
Translation 2: Mèo có mấy chân?
Translation 3: Mèo có mấy cái chân?

English Answer: A cat has four legs.
Translation 1: Mèo có bốn chân.
Translation 2: Mèo có 4 chân.
Translation 3: Mèo có bốn cái chân.

Explanation 1: Cats are quadrupedal animals.
Translation 1: Mèo là động vật có bốn chân.
Translation 2: Mèo là loài động vật có bốn chân.
Translation 3: Mèo là loài động vật có bốn cái chân.
"""


messages = [
    {"role": "system", "content": prompt},
    {"role": "user", "content": content_user}
]

# Gọi API
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=messages,
    max_tokens=100,
    temperature=0.1,
    response_format={"type": "json_object"}
)


In [14]:
print(response.choices[0].message.content)
json_object = json.loads(response.choices[0].message.content)
json_object['explain_score']

{
    "question_score": [90, 95, 85],
    "answer_score": [95, 95, 85],
    "explain_score": [
        [90, 95, 85]
    ]
}


[[90, 95, 85]]

In [2]:
import json
import pandas as pd

# Đường dẫn đến các file dữ liệu
datasets_dir = '../../../datasets/VQA-X'
dataset_files = ['vqaX_train_translated.json', 'vqaX_val_translated.json', 'vqaX_test_translated.json']

# Hàm để lấy tất cả các câu trả lời từ một file
def get_answers_from_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    answers = []
    for key, item in data.items():
        answers.append((key, item['answer_vi_gemini']))
    
    return answers

# Lấy tất cả các câu trả lời từ tất cả các file
all_answers = []
for file in dataset_files:
    file_path = f"{datasets_dir}/{file}"
    all_answers.extend(get_answers_from_file(file_path))

sorted_answers = sorted(all_answers, key=lambda x: x[1])
df = pd.DataFrame(sorted_answers, columns=['id', 'answer_vi_gemini'])
output_file = 'vqax_answer_gemini.csv'
df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"Đã lưu {len(sorted_answers)} câu trả lời duy nhất vào file {output_file}")

Đã lưu 32886 câu trả lời duy nhất vào file vqax_answer_gemini.csv


## Calculate tokens

In [3]:
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import tiktoken
from collections import Counter
enc = tiktoken.encoding_for_model('gpt-4o-mini')
def count_tokens(text):
    return len(enc.encode(text))

def get_most_common_answers(answers, n=10):
    counter = Counter(answers)
    
    return counter.most_common(n)
def count_data_tokens(data: dict):
    total_tokens = 0
    quest_tokens = 0
    ans_tokens = 0
    expl_tokens = 0
    for key, value in data.items():
        # Count tokens in each question
        quest_tokens += count_tokens(value['question'])
        
        
        # just count most common answer
        answers = [ans['answer'] for ans in value['answers']]
        most_common_answer = get_most_common_answers(answers, 1)[0][0]
        ans_tokens += count_tokens(most_common_answer)
        
        # count tokens in each explanation
        if 'explanation' in value:
            for explanation in value['explanation']:
                expl_tokens += count_tokens(explanation)
    
    total_tokens = quest_tokens + ans_tokens + expl_tokens
    
    return {
        "total_tokens": total_tokens,
        "question_tokens": quest_tokens,
        "answer_tokens": ans_tokens,
        "explanation_tokens": expl_tokens 
    }

# example usage
tokens = count_data_tokens(train_data)
print(f"Total: {tokens['total_tokens']}")
print(f"Question: {tokens['question_tokens']}")
print(f"Answer: {tokens['answer_tokens']}")
print(f"Explanation: {tokens['explanation_tokens']}")


tokens = count_data_tokens(val_data)
print(f"Total: {tokens['total_tokens']}")
print(f"Question: {tokens['question_tokens']}")
print(f"Answer: {tokens['answer_tokens']}")
print(f"Explanation: {tokens['explanation_tokens']}")
tokens = count_data_tokens(test_data)
print(f"Total: {tokens['total_tokens']}")
print(f"Question: {tokens['question_tokens']}")
print(f"Answer: {tokens['answer_tokens']}")
print(f"Explanation: {tokens['explanation_tokens']}")

Total: 514854
Question: 193878
Answer: 40479
Explanation: 280497
Total: 52326
Question: 9648
Answer: 2195
Explanation: 40483
Total: 70226
Question: 12564
Answer: 2908
Explanation: 54754
