In [1]:
import json
from datasets import load_dataset
import os
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a small, reproducible OK-VQA sample: shuffle the val2014 split with a
# fixed seed, keep the first `num_samples` rows, export each row's image as a
# JPEG and collect a trimmed annotation record per row in `processed_data`.
num_samples = 100
seed = 42
full_dataset = load_dataset("lmms-lab/OK-VQA", split="val2014")
full_dataset = full_dataset.shuffle(seed=seed)
# Never request more rows than the split holds, so a larger `num_samples`
# degrades to "take everything" instead of failing on out-of-range indices.
dummy_subset = full_dataset.select(range(min(num_samples, len(full_dataset))))
images_ouput_dir = "data/images/val2014"
annotations_dir = os.path.dirname("data/annotations/okvqa_dummy_100.json")

os.makedirs(annotations_dir, exist_ok=True)
os.makedirs(images_ouput_dir, exist_ok=True)

# Pixel modes the Pillow JPEG writer accepts as-is; images in any other mode
# (e.g. RGBA or palette images) are converted to RGB before saving.
jpeg_writable_modes = {"1", "L", "RGB", "RGBX", "CMYK", "YCbCr"}

processed_data = []
for item in tqdm(dummy_subset, desc="Exporting OK-VQA samples"):
    clean_item = {
        "id": item["question_id"],
        "question": item["question"],
        # Images are stored below under the question id, so image_id mirrors it.
        "image_id": item["question_id"],
        "golden_answers": item["answers"]
    }
    processed_data.append(clean_item)

    image_obj = item["image"]
    image_filename = f"{item['question_id']}.jpg"
    save_path = os.path.join(images_ouput_dir, image_filename)

    # Skip images already exported by a previous run of this cell.
    if not os.path.exists(save_path):
        try:
            if image_obj.mode not in jpeg_writable_modes:
                image_obj = image_obj.convert("RGB")
            image_obj.save(save_path, 'JPEG')
        except Exception as e:
            # Best effort: report the failure without aborting the export and
            # without breaking the progress bar rendering.
            tqdm.write(f"Could not save image {save_path}. Error: {e}")


In [4]:
# Dump the collected annotation records as pretty-printed UTF-8 JSON.
# NOTE(review): an earlier cell creates data/annotations/ for a file of this
# name, yet this path is relative to the working directory — confirm which
# location is intended.
output_filename = "okvqa_dummy_100.json"
with open(output_filename, mode='w', encoding='utf-8') as annotations_file:
    json.dump(obj=processed_data, fp=annotations_file, ensure_ascii=False, indent=4)

In [1]:
import json

def convert_json_to_jsonl(input_file_path, output_file_path):
    """
    Reads a standard JSON file (expected to be a list of objects) and
    writes it to a JSON Lines (.jsonl) file.

    Every element of the list becomes one single-line JSON document in the
    output, with non-ASCII characters written as-is. Problems are reported
    on stdout instead of being raised, and the function always returns None.

    Args:
        input_file_path (str): The path to the input .json file.
        output_file_path (str): The path to the output .jsonl file.
    """
    # Reading and writing are guarded separately so that a FileNotFoundError
    # is always attributed to the path that actually caused it.
    try:
        with open(input_file_path, 'r', encoding='utf-8') as infile:
            # Load the whole document at once; it is expected to parse to a list.
            data = json.load(infile)
    except FileNotFoundError:
        print(f"Error: The file '{input_file_path}' was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: The file '{input_file_path}' is not a valid JSON file.")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    # Make sure the input really is a list before touching the output file.
    if not isinstance(data, list):
        print(f"Error: The input JSON file '{input_file_path}' does not contain a list of objects.")
        return

    try:
        with open(output_file_path, 'w', encoding='utf-8') as outfile:
            for entry in data:
                # One JSON document per line; the newline terminates the record.
                outfile.write(json.dumps(entry, ensure_ascii=False) + '\n')
    except FileNotFoundError:
        # Opening for writing fails this way when the parent directory is missing.
        print(f"Error: The output file '{output_file_path}' could not be created (does its directory exist?).")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    print(f"Successfully converted '{input_file_path}' to '{output_file_path}'.")
    print(f"Total entries written: {len(data)}")

In [2]:
# Convert dev.json into a JSON Lines file written next to it.
input_file = "data/okvqa_dummy_100/dev.json"
output_file = "data/okvqa_dummy_100/dev.jsonl"

convert_json_to_jsonl(input_file_path=input_file, output_file_path=output_file)

Successfully converted 'data/okvqa_dummy_100/dev.json' to 'data/okvqa_dummy_100/dev.jsonl'.
Total entries written: 100


In [3]:
import json
import os  # needed by the cleanup in the error handlers below
import shutil

# Rewrite a JSONL file in place so that every record carrying an 'id' also
# carries an 'image_id' with the same value.
input_filename = 'data/okvqa_dummy_100/test.jsonl'
# Records are first written to a temporary file, which replaces the original
# only after every line has been processed successfully.
temp_filename = 'output.jsonl.tmp'

try:
    # 1. Open the original file for reading and the temporary file for writing.
    # UTF-8 is used so non-ASCII text survives the round trip.
    with open(input_filename, 'r', encoding='utf-8') as infile, \
         open(temp_filename, 'w', encoding='utf-8') as outfile:

        # 2. Read and process the file line by line.
        for line in infile:
            # Drop surrounding whitespace, including the trailing newline.
            line = line.strip()
            if not line:
                continue  # skip blank lines

            # 3. Parse the JSON string into a Python dict.
            data = json.loads(line)

            # 4. If an 'id' field is present, copy it into 'image_id'.
            if 'id' in data:
                data['image_id'] = data['id']

            # 5. Serialize the updated record back to one JSON line.
            # ensure_ascii=False keeps non-ASCII characters readable instead of
            # escaping them; the trailing newline preserves the JSONL format.
            outfile.write(json.dumps(data, ensure_ascii=False) + '\n')

    # 6. Everything succeeded: replace the original with the processed file.
    shutil.move(temp_filename, input_filename)

    print(f"处理完成！文件 '{input_filename}' 已成功更新。")

except FileNotFoundError:
    print(f"错误：找不到文件 '{input_filename}'。请确保文件在正确的路径下。")
except json.JSONDecodeError as e:
    print(f"错误：文件 '{input_filename}' 中存在无效的JSON格式。错误信息: {e}")
    # On failure, remove the temporary file that may have been created.
    if os.path.exists(temp_filename):
        os.remove(temp_filename)
except Exception as e:
    print(f"发生未知错误: {e}")
    # On failure, remove the temporary file that may have been created.
    if os.path.exists(temp_filename):
        os.remove(temp_filename)

处理完成！文件 'data/okvqa_dummy_100/test.jsonl' 已成功更新。
