In [11]:
import json
import random
import os
from pathlib import Path

# Set random seed for reproducibility
random.seed(42)

# Path to the dataset directory. Use "AlgoPuzzleVQA/data" instead to process
# the AlgoPuzzleVQA dataset.
data_dir = "PuzzleVQA/data"

# Find all JSON files in the data directory.
# os.listdir() returns entries in arbitrary (filesystem-dependent) order, so
# the names are sorted here; otherwise a seeded shuffle of this list would
# still produce different results on different machines.
json_files = sorted(file for file in os.listdir(data_dir) if file.endswith('.json'))

print(f"Found {len(json_files)} JSON files:")
for file in json_files:
    print(f"  - {file}")

Found 20 JSON files:
  - circle_size_number.json
  - color_grid.json
  - color_hexagon.json
  - color_number_hexagon.json
  - color_overlap_squares.json
  - color_size_circle.json
  - grid_number.json
  - grid_number_color.json
  - polygon_sides_color.json
  - polygon_sides_number.json
  - rectangle_height_color.json
  - rectangle_height_number.json
  - shape_morph.json
  - shape_reflect.json
  - shape_size_grid.json
  - shape_size_hexagon.json
  - size_cycle.json
  - size_grid.json
  - triangle.json
  - venn.json


In [12]:
# Calculate split point (half of total files)
total_files = len(json_files)
split_point = total_files // 2

# Randomly split into training and testing sets.
# A copy of the list is shuffled with a dedicated, freshly seeded generator so
# that (a) json_files itself is left untouched and (b) re-running this cell
# always yields the same split. random.Random(42) produces the same sequence
# as the module-level generator right after random.seed(42).
split_rng = random.Random(42)
shuffled_files = list(json_files)
split_rng.shuffle(shuffled_files)
train_files = shuffled_files[:split_point]
test_files = shuffled_files[split_point:]

print(f"\nTotal files: {total_files}")
print(f"Split point: {split_point}")
print(f"\nTraining files ({len(train_files)}):")
for file in train_files:
    print(f"  - {file}")

print(f"\nTesting files ({len(test_files)}):")
for file in test_files:
    print(f"  - {file}")


Total files: 20
Split point: 10

Training files (10):
  - venn.json
  - color_size_circle.json
  - shape_size_grid.json
  - color_overlap_squares.json
  - polygon_sides_number.json
  - shape_reflect.json
  - shape_size_hexagon.json
  - triangle.json
  - grid_number.json
  - shape_morph.json

Testing files (10):
  - size_grid.json
  - rectangle_height_color.json
  - color_grid.json
  - rectangle_height_number.json
  - color_hexagon.json
  - size_cycle.json
  - grid_number_color.json
  - polygon_sides_color.json
  - circle_size_number.json
  - color_number_hexagon.json


In [13]:
def flatten_json_files(file_list, output_file, source_dir=None):
    """Flatten multiple line-delimited JSON files into a single JSONL file.

    Every input file is read line by line; each non-empty line is parsed as
    one JSON object, tagged with a ``"source_file"`` key holding the name of
    the file it came from, and written as one line of ``output_file``.
    Lines that are not valid JSON are reported with a warning and skipped.

    Args:
        file_list: File names (relative to ``source_dir``) to read, in order.
        output_file: Path of the JSONL file to create (overwritten if present).
        source_dir: Directory containing the input files. Defaults to the
            notebook-level ``data_dir`` when omitted.

    Returns:
        None.
    """
    if source_dir is None:
        # Fall back to the notebook-level setting for existing callers.
        source_dir = data_dir

    # Explicit UTF-8 keeps reading/writing independent of the machine's locale.
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for json_file in file_list:
            file_path = os.path.join(source_dir, json_file)
            print(f"Processing {json_file}...")

            with open(file_path, 'r', encoding='utf-8') as infile:
                for line in infile:
                    line = line.strip()
                    if not line:  # Skip empty lines
                        continue
                    try:
                        json_obj = json.loads(line)
                    except json.JSONDecodeError as e:
                        print(f"Warning: Could not parse JSON in {json_file}: {e}")
                        continue
                    # Add source_file field to track origin
                    json_obj["source_file"] = json_file
                    outfile.write(json.dumps(json_obj) + '\n')

In [14]:
# Flatten every training-split file into one JSONL file
train_output_file = "PuzzleVQA_train.jsonl"
print("\nCreating training JSONL file...")
flatten_json_files(train_files, train_output_file)


Creating training JSONL file...
Processing venn.json...
Processing color_size_circle.json...
Processing shape_size_grid.json...
Processing color_overlap_squares.json...
Processing polygon_sides_number.json...
Processing shape_reflect.json...
Processing shape_size_hexagon.json...
Processing triangle.json...
Processing grid_number.json...
Processing shape_morph.json...


In [15]:
# Flatten every testing-split file into one JSONL file
test_output_file = "PuzzleVQA_test.jsonl"
print("\nCreating testing JSONL file...")
flatten_json_files(test_files, test_output_file)


Creating testing JSONL file...
Processing size_grid.json...
Processing rectangle_height_color.json...
Processing color_grid.json...
Processing rectangle_height_number.json...
Processing color_hexagon.json...
Processing size_cycle.json...
Processing grid_number_color.json...
Processing polygon_sides_color.json...
Processing circle_size_number.json...
Processing color_number_hexagon.json...


In [16]:
# Count lines in output files
def count_lines(filename):
    """Return the number of lines in the text file at ``filename``."""
    total = 0
    with open(filename, 'r') as handle:
        # Stream the file so it is never held in memory as a whole.
        for _line in handle:
            total += 1
    return total

# Report how many records ended up in each split file (one record per line).
# For the AlgoPuzzleVQA dataset the files are AlgoPuzzleVQA_train.jsonl and
# AlgoPuzzleVQA_test.jsonl.
train_record_count = count_lines('PuzzleVQA_train.jsonl')
print(f"\nTraining file: PuzzleVQA_train.jsonl ({train_record_count} records)")
test_record_count = count_lines('PuzzleVQA_test.jsonl')
print(f"Testing file: PuzzleVQA_test.jsonl ({test_record_count} records)")

print("\nDone!")


Training file: PuzzleVQA_train.jsonl (1000 records)
Testing file: PuzzleVQA_test.jsonl (1000 records)

Done!


In [17]:
subset_output_file = "PuzzleVQA_train.jsonl"
# Read the first record inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(subset_output_file) as f:
    first_record = json.loads(f.readline())
print("Sample record with UID:", first_record)

Sample record with UID: {'image': 'images/venn/venn_0000.png', 'question': 'What is the missing number of the part denoted with a question mark?', 'answer': 1, 'options': ['4', '8', '1', '9'], 'caption': "There are 3 overlapping circles containing the numbers [7, 9, '?']. The overlapping part between the first and second circle contains the number 16. The overlapping part between the second and third circle contains the number 10.", 'explanation': 'We observe that the circles with 7 and 9 overlap to form the part 16, where 7 + 9 = 16. Hence, the pattern is most likely that the numbers in the overlapping parts are the sum of the numbers in the corresponding circles.', 'deduction': 'Based on the pattern that the numbers in the overlapping parts are the sum of the numbers in the corresponding circles, the missing number of the circle where the overlapping part is 10 should be 1.', 'source_file': 'venn.json'}


In [18]:
# Resolve a record's relative image path against the local image directory.
source_image_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest/PuzzleVQA/data/images"
loaded_image_path = "images/venn/venn_0000.png"

# Drop the first path component ("images"), because source_image_dir already
# ends with it; everything after the first "/" is kept unchanged.
source_image_path = loaded_image_path.partition("/")[2]
full_source_image_path = os.path.join(source_image_dir, source_image_path)
print("full_source_image_path: ", full_source_image_path)

# Bare file name, without any directory components
image_filename = os.path.basename(loaded_image_path)
print("image_filename: ", image_filename)

full_source_image_path:  /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest/PuzzleVQA/data/images/venn/venn_0000.png
image_filename:  venn_0000.png


In [25]:
import os
import json
import shutil

# Build a copy of the training JSONL in which every record additionally
# carries an absolute "image_path". No image files are copied or moved here;
# only the JSONL records are rewritten.
subset_output_file = "AlgoPuzzleVQA_train.jsonl"
# Create final JSONL with absolute image paths
final_output_file = "AlgoPuzzleVQA_train_run1_1K_v1_subset.jsonl"
base_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest"
# Dataset root that the records' relative "image" paths are resolved against.
# Derived from base_dir so the long prefix is written only once.
absolute_subset_images_dir = f"{base_dir}/AlgoPuzzleVQA/data"

print(f"Creating final JSONL with absolute image paths: {final_output_file}")

# Explicit UTF-8 keeps the result independent of the machine's locale.
with open(subset_output_file, 'r', encoding='utf-8') as input_f, \
        open(final_output_file, 'w', encoding='utf-8') as output_f:
    for line in input_f:
        line = line.strip()
        if not line:
            # A blank line is not a record; json.loads("") would raise.
            continue
        record = json.loads(line)

        # Relative image path as stored in the record (e.g. "images/<puzzle>/<file>")
        relative_image_path = record["image"]

        # Absolute location of that image under the dataset root
        absolute_image_path = os.path.join(absolute_subset_images_dir, relative_image_path)

        # Add the image_path field
        record["image_path"] = absolute_image_path

        # Write the updated record
        output_f.write(json.dumps(record) + "\n")

print(f"Created final output file: {final_output_file}")

# Show sample of final output
with open(final_output_file, 'r', encoding='utf-8') as f:
    first_line = f.readline()

if first_line.strip():
    sample_record = json.loads(first_line)
    print("Sample final record:")
    print(json.dumps(sample_record, indent=2))
else:
    # Nothing was written (empty input), so there is no sample to show.
    print("No records were written.")

Creating final JSONL with absolute image paths: AlgoPuzzleVQA_train_run1_1K_v1_subset.jsonl
Created final output file: AlgoPuzzleVQA_train_run1_1K_v1_subset.jsonl
Sample final record:
{
  "image": "images/think_dot/think_dot_0000.jpg",
  "question": "The toy shown in the figure has eight coloured disks on its front, and three holes on its top \u2013 left, right, and center \u2013 through which a ball bearing could be dropped. Each disk would display either a yellow or blue face. When a ball passes through a disc it tips the disk mechanism which flips the face color. The tipping of the disc mechanism determines whether the ball would be deflected to the left or to the right. The vertical walls between the discs would then determine the path of motion of the ball. A dropped ball always passes through exactly one disc in each of the top and the bottom row. Depending on the configuration of the top three discs it may or may not pass through the middle row. Finally, when the ball falls to t

In [33]:
import json
import os
from pathlib import Path

def validate_image_paths(jsonl_file, allowed_extensions=(".png", ".jpg")):
    """Check that every record's ``image_path`` points to an existing image file.

    Each non-blank line of ``jsonl_file`` is parsed as JSON and its
    ``image_path`` value is checked in this order: the path exists, its
    extension is one of ``allowed_extensions`` (case-insensitive), and it is a
    regular file. Problems are printed as they are found, followed by a
    summary.

    Args:
        jsonl_file: Path of the JSONL file to validate.
        allowed_extensions: File extensions (with leading dot) accepted as
            images. A single string is also accepted.

    Returns:
        dict with the keys
            ``valid_paths``:   list of ``(line_num, image_path)``
            ``invalid_paths``: list of ``(line_num, image_path_or_None, reason)``
            ``missing_files``: list of ``(line_num, image_path)`` for paths
                               that do not exist (these also appear in
                               ``invalid_paths``)
            ``total_records``: number of records examined (valid + invalid)
    """
    if isinstance(allowed_extensions, str):
        allowed_extensions = (allowed_extensions,)
    allowed_extensions = tuple(ext.lower() for ext in allowed_extensions)
    extensions_label = ", ".join(allowed_extensions)

    valid_paths = []
    invalid_paths = []
    missing_files = []

    print(f"Validating image paths in: {jsonl_file}")
    print("=" * 60)

    # Read the JSONL file
    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                # Blank lines are not records; do not count them as failures.
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Line {line_num}: JSON decode error: {e}")
                invalid_paths.append((line_num, None, f"JSON error: {e}"))
                continue

            # A line holding valid JSON that is not an object (e.g. a list)
            # cannot carry an image_path; treat it like a missing field.
            image_path = record.get('image_path') if isinstance(record, dict) else None

            if not image_path:
                print(f"Line {line_num}: Missing 'image_path' field")
                invalid_paths.append((line_num, None, "Missing image_path field"))
                continue

            # Check if file exists
            if not os.path.exists(image_path):
                print(f"Line {line_num}: File not found: {image_path}")
                missing_files.append((line_num, image_path))
                invalid_paths.append((line_num, image_path, "File not found"))
                continue

            # Check the extension. The reason strings stored in invalid_paths
            # are kept unchanged because they are part of the returned data.
            if not image_path.lower().endswith(allowed_extensions):
                print(f"Line {line_num}: File exists but is not a supported image type ({extensions_label}): {image_path}")
                invalid_paths.append((line_num, image_path, "Not PNG file"))
                continue

            # Check if it's actually a file (not a directory)
            if not os.path.isfile(image_path):
                print(f"Line {line_num}: Path exists but is not a file: {image_path}")
                invalid_paths.append((line_num, image_path, "Not a file"))
                continue

            valid_paths.append((line_num, image_path))

    total_records = len(valid_paths) + len(invalid_paths)

    # Summary
    print("\n" + "=" * 60)
    print("VALIDATION SUMMARY")
    print("=" * 60)
    print(f"Total records processed: {total_records}")
    print(f"Valid image files: {len(valid_paths)}")
    print(f"Invalid/missing files: {len(invalid_paths)}")
    print(f"Missing files: {len(missing_files)}")

    if valid_paths:
        print(f"\nFirst 5 valid paths:")
        for line_num, path in valid_paths[:5]:
            print(f"  Line {line_num}: {path}")

    if missing_files:
        print(f"\nFirst 5 missing files:")
        for line_num, path in missing_files[:5]:
            print(f"  Line {line_num}: {path}")

    # Return results for further analysis
    return {
        'valid_paths': valid_paths,
        'invalid_paths': invalid_paths,
        'missing_files': missing_files,
        'total_records': total_records
    }

# Run the validation
jsonl_file = "./prepared_jsonl/AlgoPuzzleVQA_train_run1_1K_v1_subset.jsonl"
# jsonl_file = "./prepared_jsonl/PuzzleVQA_train_run1_1K_v1_subset.jsonl"
results = validate_image_paths(jsonl_file)

# Additional analysis
valid_count = len(results['valid_paths'])
total_count = results['total_records']
if total_count:
    print(f"\nSuccess rate: {valid_count}/{total_count} = {valid_count/total_count*100:.1f}%")
else:
    # An empty input file would otherwise raise ZeroDivisionError here.
    print("\nSuccess rate: n/a (no records found)")

Validating image paths in: ./prepared_jsonl/AlgoPuzzleVQA_train_run1_1K_v1_subset.jsonl

VALIDATION SUMMARY
Total records processed: 900
Valid PNG files: 900
Invalid/missing files: 0
Missing files: 0

First 5 valid paths:
  Line 1: /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest/AlgoPuzzleVQA/data/images/think_dot/think_dot_0000.jpg
  Line 2: /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest/AlgoPuzzleVQA/data/images/think_dot/think_dot_0001.jpg
  Line 3: /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest/AlgoPuzzleVQA/data/images/think_dot/think_dot_0002.jpg
  Line 4: /data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/PuzzleTest/AlgoPuzzleVQA/data/images/think_dot/think_dot_0003.jpg
  