In [1]:
import json
import random
import uuid
import os
import shutil
from pathlib import Path
import pandas as pd
from collections import defaultdict

# Progress marker: printed once every import above has succeeded.
print("Imports completed successfully")

Imports completed successfully


In [2]:
# Load the full JSONL file into `data` as a list of dicts (one per non-empty line).
input_file = "full_dvqa_en_20240402_extracted_int_only.jsonl"
data = []

print(f"Loading data from {input_file}...")
# JSON text is UTF-8; state it explicitly instead of relying on the locale default.
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Skip blank lines (e.g. a trailing empty line) that json.loads would reject.
        if not line:
            continue
        data.append(json.loads(line))

print(f"Loaded {len(data)} records")
# Guard the preview so an empty input file reports cleanly instead of raising IndexError.
if data:
    print("Sample record:", data[0])
else:
    print(f"Warning: no records found in {input_file}")

Loading data from full_dvqa_en_20240402_extracted_int_only.jsonl...
Loaded 20442 records
Sample record: {'image': 'MMPR-v1.2-prompts/correctness_images/dvqa_en_20240402_extracted_int_only/images/bar_train_00109559.png', 'question': 'What is the value of the largest individual bar in the whole chart? Answer the question using a single word or phrase.', 'answer': '6'}


In [3]:
# Deduplicate by image path: the first record seen for each image is the one kept.
unique_images = {}
for record in data:
    image_path = record["image"]
    # setdefault stores the record only when this path has not been seen yet.
    unique_images.setdefault(image_path, record)

print(f"Found {len(unique_images)} unique images")

# Draw a 15K subset from the deduplicated records.
sample_size = 15000
unique_records = list(unique_images.values())

if len(unique_records) >= sample_size:
    # Fixed seed so the draw (without replacement) is repeatable across runs.
    random.seed(42)
    sampled_data = random.sample(unique_records, sample_size)
else:
    # Not enough unique images: fall back to using all of them.
    print(f"Warning: Only {len(unique_records)} unique images available, less than requested {sample_size}")
    sampled_data = unique_records

print(f"Selected {len(sampled_data)} records for subset")

Found 19609 unique images
Selected 15000 records for subset


In [4]:
# Add a UUID to each sampled record and save the subset as JSONL.
subset_output_file = "dvqa_int_only_15K_v1_subset.jsonl"

print("Adding UIDs and saving subset...")
with open(subset_output_file, 'w', encoding='utf-8') as f:
    for record in sampled_data:
        # Work on a shallow copy so the dicts in sampled_data are not modified.
        record_with_uid = record.copy()
        # NOTE: uuid4() is not controlled by random.seed(), so the UIDs differ
        # on every run even though the sampled records are the same.
        record_with_uid["uid"] = str(uuid.uuid4())

        # One JSON object per line.
        f.write(json.dumps(record_with_uid) + "\n")

print(f"Saved {len(sampled_data)} records to {subset_output_file}")

# Read the first line back inside a context manager so the file handle is closed
# (the previous bare open(...).readline() left it open).
with open(subset_output_file, 'r', encoding='utf-8') as f:
    first_line = f.readline()

# An empty subset produces an empty file; skip the preview rather than crash.
if first_line:
    print("Sample record with UID:", json.loads(first_line))

Adding UIDs and saving subset...
Saved 15000 records to dvqa_int_only_15K_v1_subset.jsonl
Sample record with UID: {'image': 'MMPR-v1.2-prompts/correctness_images/dvqa_en_20240402_extracted_int_only/images/bar_train_00023964.png', 'question': 'What is the value of the smallest individual bar in the whole chart? Answer the question using a single word or phrase.', 'answer': '2', 'uid': '70341b17-3467-4a92-b4bf-872853fd34c1'}


In [None]:
# Copy the image of every subset record into the flat subset_images directory.
subset_images_dir = "subset_images"
os.makedirs(subset_images_dir, exist_ok=True)

print(f"Copying images to {subset_images_dir} directory...")

# Root directory that the relative "image" paths in the JSONL are resolved against.
source_root = "/data/users/brandon/ob1-projects/InternVL"

copied_count = 0
failed_count = 0
# Maps each destination filename to the source path already copied under that name,
# so two different sources sharing a basename cannot silently overwrite each other.
copied_sources = {}

# Read the subset file to get the image paths
with open(subset_output_file, 'r') as f:
    for line in f:
        record = json.loads(line.strip())
        image_path = record["image"]

        # The image path in the JSONL is relative; prefix it with the source root.
        source_image_path = f"{source_root}/{image_path}"

        # The destination is flat: only the filename part of the path is kept.
        image_filename = os.path.basename(image_path)
        dest_image_path = os.path.join(subset_images_dir, image_filename)

        # Refuse to overwrite a file that was copied from a different source.
        previous_source = copied_sources.get(image_filename)
        if previous_source is not None and previous_source != source_image_path:
            print(f"Warning: Filename collision for {image_filename}: "
                  f"{source_image_path} would overwrite copy of {previous_source}")
            failed_count += 1
            continue

        try:
            if os.path.exists(source_image_path):
                # copy2 also attempts to preserve file metadata such as timestamps.
                shutil.copy2(source_image_path, dest_image_path)
                copied_sources[image_filename] = source_image_path
                copied_count += 1
            else:
                print(f"Warning: Source image not found: {source_image_path}")
                failed_count += 1
        except OSError as e:
            # Filesystem failures (permissions, disk full, same-file) are counted
            # and reported; anything else is a programming error and should surface.
            print(f"Error copying {source_image_path}: {e}")
            failed_count += 1

print(f"Successfully copied {copied_count} images")
print(f"Failed to copy {failed_count} images")

In [None]:
# Write the final JSONL: every subset record plus an absolute "image_path" field.
final_output_file = "dvqa_run1_int_only.jsonl"
# NOTE(review): base_dir is hard-coded; presumably it is the directory this notebook
# runs from (where subset_images/ gets created) - confirm before running elsewhere.
base_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/DVQA"
absolute_subset_images_dir = os.path.join(base_dir, "subset_images")

print(f"Creating final JSONL with absolute image paths: {final_output_file}")

with open(subset_output_file, 'r') as input_f:
    with open(final_output_file, 'w') as output_f:
        for line in input_f:
            record = json.loads(line.strip())

            # Only the filename is kept from the original relative image path.
            image_filename = os.path.basename(record["image"])

            # Point at the copy of the image inside the subset_images directory.
            absolute_image_path = os.path.join(absolute_subset_images_dir, image_filename)
            record["image_path"] = absolute_image_path

            # print() appends the newline that terminates each JSONL row.
            print(json.dumps(record), file=output_f)

print(f"Created final output file: {final_output_file}")

# Preview the first record of the file that was just written.
with open(final_output_file, 'r') as f:
    sample_record = json.loads(f.readline())

print("Sample final record:")
print(json.dumps(sample_record, indent=2))