In [1]:
# Modules used by the data-preparation cells in this notebook.
import json
import random
import uuid
import os
import shutil
from pathlib import Path
import pandas as pd
from collections import defaultdict
# NOTE(review): Path, pd and defaultdict are not referenced in the cells visible here — confirm before pruning.

# Visible confirmation that every import above resolved.
print("Imports completed successfully")

Imports completed successfully


In [11]:
import json
import random
import os
from pathlib import Path

# Set random seed for reproducibility
random.seed(42)

# Directory holding the puzzle JSON files.
# (The AlgoPuzzleVQA variant of this notebook used "AlgoPuzzleVQA/data" here.)
data_dir = "PuzzleVQA/data"

# Find all JSON files in the data directory.
# os.listdir() returns entries in arbitrary, filesystem-dependent order, so the
# names are sorted: a seeded shuffle is only reproducible when its input order
# is the same on every machine.
json_files = sorted(
    file_name for file_name in os.listdir(data_dir) if file_name.endswith('.json')
)

print(f"Found {len(json_files)} JSON files:")
for file in json_files:
    print(f"  - {file}")

Found 20 JSON files:
  - circle_size_number.json
  - color_grid.json
  - color_hexagon.json
  - color_number_hexagon.json
  - color_overlap_squares.json
  - color_size_circle.json
  - grid_number.json
  - grid_number_color.json
  - polygon_sides_color.json
  - polygon_sides_number.json
  - rectangle_height_color.json
  - rectangle_height_number.json
  - shape_morph.json
  - shape_reflect.json
  - shape_size_grid.json
  - shape_size_hexagon.json
  - size_cycle.json
  - size_grid.json
  - triangle.json
  - venn.json


In [12]:
# Calculate split point (half of total files)
total_files = len(json_files)
split_point = total_files // 2

# Randomly split into training and testing sets.
# The shuffle works on a sorted copy with its own seeded generator, so that:
#   * `json_files` is left untouched and re-running this cell gives the same split;
#   * the result does not depend on directory listing order or on how much of the
#     global random state other cells have consumed.
# Random(42) produces the same permutation as calling random.seed(42) immediately
# before random.shuffle() on the same (sorted) input.
shuffled_files = sorted(json_files)
random.Random(42).shuffle(shuffled_files)
train_files = shuffled_files[:split_point]
test_files = shuffled_files[split_point:]

# The two halves must partition the input exactly.
assert len(train_files) + len(test_files) == total_files

print(f"\nTotal files: {total_files}")
print(f"Split point: {split_point}")
print(f"\nTraining files ({len(train_files)}):")
for file in train_files:
    print(f"  - {file}")

print(f"\nTesting files ({len(test_files)}):")
for file in test_files:
    print(f"  - {file}")


Total files: 20
Split point: 10

Training files (10):
  - venn.json
  - color_size_circle.json
  - shape_size_grid.json
  - color_overlap_squares.json
  - polygon_sides_number.json
  - shape_reflect.json
  - shape_size_hexagon.json
  - triangle.json
  - grid_number.json
  - shape_morph.json

Testing files (10):
  - size_grid.json
  - rectangle_height_color.json
  - color_grid.json
  - rectangle_height_number.json
  - color_hexagon.json
  - size_cycle.json
  - grid_number_color.json
  - polygon_sides_color.json
  - circle_size_number.json
  - color_number_hexagon.json


In [13]:
def flatten_json_files(file_list, output_file, input_dir=None):
    """Flatten multiple JSON-lines files into a single JSONL file.

    Every non-empty line of each input file is parsed as one JSON object,
    tagged with a ``source_file`` key holding the input file name, and written
    as one line of ``output_file``. Lines that cannot be parsed, or that parse
    to something other than a JSON object, are reported and skipped.

    Args:
        file_list: Names of the input files, relative to the input directory.
            They are processed in the given order.
        output_file: Path of the JSONL file to create (overwritten if present).
        input_dir: Directory containing the input files. When omitted, the
            notebook-level ``data_dir`` variable is used.

    Returns:
        None
    """
    if input_dir is None:
        # Backward-compatible default: fall back to the notebook-level setting.
        input_dir = data_dir

    # JSON text is UTF-8; an explicit encoding avoids depending on the platform default.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for json_file in file_list:
            file_path = os.path.join(input_dir, json_file)
            print(f"Processing {json_file}...")

            with open(file_path, 'r', encoding='utf-8') as infile:
                for line_number, line in enumerate(infile, start=1):
                    line = line.strip()
                    if not line:  # Skip empty lines
                        continue

                    try:
                        json_obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Warning: Could not parse JSON in {json_file} (line {line_number}): {e}")
                        continue

                    # Only JSON objects can carry the extra key; anything else
                    # (list, number, string, ...) is reported instead of crashing.
                    if not isinstance(json_obj, dict):
                        print(f"Warning: Skipping non-object JSON in {json_file} (line {line_number})")
                        continue

                    # Add source_file field to track origin
                    json_obj["source_file"] = json_file
                    outfile.write(json.dumps(json_obj) + '\n')

In [14]:
# Merge every training-split file into a single JSONL file
train_output_file = "PuzzleVQA_train.jsonl"
print("\nCreating training JSONL file...")
flatten_json_files(train_files, train_output_file)


Creating training JSONL file...
Processing venn.json...
Processing color_size_circle.json...
Processing shape_size_grid.json...
Processing color_overlap_squares.json...
Processing polygon_sides_number.json...
Processing shape_reflect.json...
Processing shape_size_hexagon.json...
Processing triangle.json...
Processing grid_number.json...
Processing shape_morph.json...


In [15]:
# Merge every testing-split file into a single JSONL file
test_output_file = "PuzzleVQA_test.jsonl"
print("\nCreating testing JSONL file...")
flatten_json_files(test_files, test_output_file)


Creating testing JSONL file...
Processing size_grid.json...
Processing rectangle_height_color.json...
Processing color_grid.json...
Processing rectangle_height_number.json...
Processing color_hexagon.json...
Processing size_cycle.json...
Processing grid_number_color.json...
Processing polygon_sides_color.json...
Processing circle_size_number.json...
Processing color_number_hexagon.json...


In [16]:
# Helper used to report how many lines an output file contains
def count_lines(filename):
    """Return the number of lines in the text file at ``filename``."""
    total = 0
    with open(filename, 'r') as handle:
        for _line in handle:
            total += 1
    return total

# Report the number of records written to each split, in training-then-testing order
for heading, jsonl_path in (
    ("\nTraining", "PuzzleVQA_train.jsonl"),
    ("Testing", "PuzzleVQA_test.jsonl"),
):
    print(f"{heading} file: {jsonl_path} ({count_lines(jsonl_path)} records)")

print("\nDone!")


Training file: PuzzleVQA_train.jsonl (1000 records)
Testing file: PuzzleVQA_test.jsonl (1000 records)

Done!


In [3]:
# Load the full JSONL file: one JSON record per non-empty line
input_file = "full_vqav2_en_20240402_int.jsonl"
data = []

print(f"Loading data from {input_file}...")
# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) are not records.
            continue
        data.append(json.loads(line))

print(f"Loaded {len(data)} records")
if data:
    print("Sample record:", data[0])
else:
    # Avoid an IndexError on data[0] when the file held no records.
    print(f"Warning: no records found in {input_file}")

Loading data from full_vqav2_en_20240402_int.jsonl...
Loaded 5423 records
Sample record: {'image': 'MMPR-v1.2-prompts/correctness_images/vqav2_en_20240402_int/COCO_train2014_000000000086.jpg', 'question': 'What is the number on the mailbox? Answer the question using a single word or phrase.', 'answer': '24'}


In [4]:
# Keep exactly one record per image path: the first one encountered wins
unique_images = {}
for record in data:
    image_path = record["image"]
    # setdefault only stores the record when the path has not been seen yet
    unique_images.setdefault(image_path, record)

print(f"Found {len(unique_images)} unique images")

Found 4451 unique images


In [5]:
# Sample `sample_size` records, each referring to a distinct image
sample_size = 4000
unique_records = list(unique_images.values())

if len(unique_records) < sample_size:
    # Not enough unique images: keep everything rather than failing.
    print(f"Warning: Only {len(unique_records)} unique images available, less than requested {sample_size}")
    sampled_data = unique_records
else:
    # Sampling without replacement from a dedicated generator seeded with 42.
    # This draws the same sample as random.seed(42) followed by random.sample(),
    # but does not reset the global random state that other cells rely on.
    sampled_data = random.Random(42).sample(unique_records, sample_size)

print(f"Selected {len(sampled_data)} records for subset")

Selected 4000 records for subset


In [6]:
# Add a UID to each sampled record and save the subset as JSONL
subset_output_file = "vqav2_int_only_4K_v1_subset.jsonl"

print("Adding UIDs and saving subset...")
with open(subset_output_file, 'w', encoding='utf-8') as f:
    for record in sampled_data:
        # Work on a copy so the records held in `sampled_data` stay unchanged.
        record_with_uid = record.copy()
        # uuid4() is random and not controlled by random.seed(), so the UIDs
        # differ every time this cell is executed.
        record_with_uid["uid"] = str(uuid.uuid4())

        # Write to file
        f.write(json.dumps(record_with_uid) + "\n")

print(f"Saved {len(sampled_data)} records to {subset_output_file}")

# Read the first line back through a context manager so the handle is closed
# (the previous bare open(...).readline() left the file open).
with open(subset_output_file, 'r', encoding='utf-8') as f:
    first_line = f.readline()

if first_line.strip():
    print("Sample record with UID:", json.loads(first_line))
else:
    # Nothing was written, so there is no sample to show.
    print("Sample record with UID: (subset is empty)")

Adding UIDs and saving subset...
Saved 4000 records to vqav2_int_only_4K_v1_subset.jsonl
Sample record with UID: {'image': 'MMPR-v1.2-prompts/correctness_images/vqav2_en_20240402_int/COCO_train2014_000000287656.jpg', 'question': 'What number is the guy with the red number on his back? Answer the question using a single word or phrase.', 'answer': '24', 'uid': '6ac68031-7f0e-4949-b091-df92c0d1ff43'}


In [7]:
# Worked example: map the "image" value stored in a record onto its location on disk
source_image_dir = "/data/users/brandon/ob1-projects/MMPR-v1.2-prompts"
loaded_image_path = "MMPR-v1.2-prompts/correctness_images/vqav2_en_20240402_int/COCO_train2014_000000287656.jpg"

# Everything after the first "/" — i.e. the path without its leading component
source_image_path = loaded_image_path.partition("/")[2]
full_source_image_path = os.path.join(source_image_dir, source_image_path)
print("full_source_image_path: ", full_source_image_path)

# Bare file name, used when the image is copied into a flat directory
image_filename = os.path.basename(loaded_image_path)
print("image_filename: ", image_filename)

full_source_image_path:  /data/users/brandon/ob1-projects/MMPR-v1.2-prompts/correctness_images/vqav2_en_20240402_int/COCO_train2014_000000287656.jpg
image_filename:  COCO_train2014_000000287656.jpg


In [8]:
import os
import json
import shutil

# Copy the image of every subset record into the flat subset_images directory
subset_output_file = "vqav2_int_only_4K_v1_subset.jsonl"
subset_images_dir = "subset_images"
# Root directory that replaces the first path component of each record's "image"
# value. It never changes, so it is set once instead of on every iteration.
source_image_dir = "/data/users/brandon/ob1-projects/MMPR-v1.2-prompts"
os.makedirs(subset_images_dir, exist_ok=True)

print(f"Copying images to {subset_images_dir} directory...")

# Files already present are tracked separately from files copied by this run,
# so the final report does not claim skipped files were copied.
copied_count = 0
skipped_count = 0
failed_count = 0

# Read the subset file to get the image paths
with open(subset_output_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank line is not a record; json.loads would raise on it.
            continue
        record = json.loads(line)

        # Drop the leading path component and resolve against the source root.
        source_image_path = "/".join(record["image"].split("/")[1::])
        full_source_image_path = os.path.join(source_image_dir, source_image_path)

        # Images are stored flat, keyed by file name only.
        image_filename = os.path.basename(source_image_path)
        dest_image_path = os.path.join(subset_images_dir, image_filename)

        # Skip if destination already exists to avoid overwriting
        if os.path.exists(dest_image_path):
            print(f"Skipping {image_filename} - already exists")
            skipped_count += 1
            continue

        if not os.path.exists(full_source_image_path):
            print(f"Warning: Source image not found: {full_source_image_path}")
            failed_count += 1
            continue

        try:
            shutil.copy2(full_source_image_path, dest_image_path)
        except OSError as e:
            # Copy failures (permissions, disk full, ...) surface as OSError;
            # report and continue so one bad file does not abort the whole run.
            print(f"Error copying {full_source_image_path}: {e}")
            failed_count += 1
            continue

        copied_count += 1
        if copied_count % 1000 == 0:  # Progress update every 1000 files
            print(f"Copied {copied_count} images so far...")

print(f"Successfully copied {copied_count} images")
print(f"Skipped {skipped_count} images that already existed")
print(f"Failed to copy {failed_count} images")

Copying images to subset_images directory...
Copied 1000 images so far...
Copied 2000 images so far...
Copied 3000 images so far...
Copied 4000 images so far...
Successfully copied 4000 images
Failed to copy 0 images


In [9]:
# Sanity check: how many entries does the subset_images directory contain?
import os

subset_images_dir = "subset_images"

# os.listdir yields one name per directory entry, so its length is the entry count
directory_entries = os.listdir(subset_images_dir)
num_files = len(directory_entries)
print(f"Number of files in {subset_images_dir}: {num_files}")

Number of files in subset_images: 4000


In [10]:
import os
import json
import shutil

# Build the final JSONL: every subset record gains an absolute "image_path"
subset_output_file = "vqav2_int_only_4K_v1_subset.jsonl"
final_output_file = "vqav2_run1_int_only_4K_v1_subset.jsonl"
# NOTE(review): base_dir is presumably the directory in which subset_images was created — confirm
base_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/VQAv2"
absolute_subset_images_dir = os.path.join(base_dir, "subset_images")

print(f"Creating final JSONL with absolute image paths: {final_output_file}")

with open(subset_output_file, 'r') as input_f:
    with open(final_output_file, 'w') as output_f:
        for line in input_f:
            record = json.loads(line.strip())

            # The copied image keeps its original file name inside subset_images
            image_filename = os.path.basename(record["image"])
            absolute_image_path = os.path.join(absolute_subset_images_dir, image_filename)

            # Attach the absolute location and emit the record as one JSON line
            record["image_path"] = absolute_image_path
            print(json.dumps(record), file=output_f)

print(f"Created final output file: {final_output_file}")

# Show the first record of the final output, pretty-printed
with open(final_output_file, 'r') as f:
    first_line = f.readline()
    sample_record = json.loads(first_line)
    print("Sample final record:")
    print(json.dumps(sample_record, indent=2))

Creating final JSONL with absolute image paths: vqav2_run1_int_only_4K_v1_subset.jsonl
Created final output file: vqav2_run1_int_only_4K_v1_subset.jsonl
Sample final record:
{
  "image": "MMPR-v1.2-prompts/correctness_images/vqav2_en_20240402_int/COCO_train2014_000000287656.jpg",
  "question": "What number is the guy with the red number on his back? Answer the question using a single word or phrase.",
  "answer": "24",
  "uid": "6ac68031-7f0e-4949-b091-df92c0d1ff43",
  "image_path": "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/VQAv2/subset_images/COCO_train2014_000000287656.jpg"
}
