In [1]:
# Imports for the subset-building cells of this notebook.
# NOTE(review): pandas (pd), defaultdict and Path are not referenced in the
# cells visible here — confirm they are unused before removing them.
import json
import random
import uuid
import os
import shutil
from pathlib import Path
import pandas as pd
from collections import defaultdict

# Confirmation message printed once every import above has succeeded.
print("Imports completed successfully")

Imports completed successfully


In [2]:
# Load the full JSONL file: one JSON object per line, accumulated into `data`.
input_file = "full_CLEVR_math_en_20240402_extracted.jsonl"
data = []

print(f"Loading data from {input_file}...")
# Explicit UTF-8 so decoding does not depend on the platform's locale default.
with open(input_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON documents; skip them instead of letting json.loads raise.
        if not line:
            continue
        data.append(json.loads(line))

print(f"Loaded {len(data)} records")
# Guard the preview so an empty input reports cleanly rather than raising
# IndexError on data[0].
if data:
    print("Sample record:", data[0])
else:
    print(f"Warning: no records found in {input_file}")

Loading data from full_CLEVR_math_en_20240402_extracted.jsonl...
Loaded 20596 records
Sample record: {'image': 'MMPR-v1.2-prompts/correctness_images/CLEVR_math_en_20240402_extracted/CLEVR_v1.0/images/train/CLEVR_train_034503.png', 'question': 'Subtract 1 balls. How many balls are left?\nAnswer the question using a single word or phrase.', 'answer': '2'}


In [3]:
# Deduplicate by image path, keeping the first record encountered for each image.
unique_images = {}
for record in data:
    # setdefault only stores the record when the image path is not yet a key
    unique_images.setdefault(record["image"], record)

print(f"Found {len(unique_images)} unique images")

# Draw a fixed-size subset (15K) from the deduplicated records
sample_size = 15000
unique_records = list(unique_images.values())

if len(unique_records) >= sample_size:
    # Seed the module-level generator so the same subset is drawn on every run,
    # then sample without replacement.
    random.seed(42)
    sampled_data = random.sample(unique_records, sample_size)
else:
    # Not enough unique images: fall back to using all of them.
    print(f"Warning: Only {len(unique_records)} unique images available, less than requested {sample_size}")
    sampled_data = unique_records

print(f"Selected {len(sampled_data)} records for subset")

Found 18012 unique images
Selected 15000 records for subset


In [4]:
# Attach a random UUID4 "uid" to each sampled record and write the subset as
# JSONL (one JSON object per line).
subset_output_file = "CLEVR_int_only_15K_v1_subset.jsonl"

print("Adding UIDs and saving subset...")
with open(subset_output_file, 'w') as f:
    for record in sampled_data:
        # Copy first so the records held in sampled_data are left unmodified.
        record_with_uid = record.copy()
        record_with_uid["uid"] = str(uuid.uuid4())

        # Write to file
        f.write(json.dumps(record_with_uid) + "\n")

print(f"Saved {len(sampled_data)} records to {subset_output_file}")

# Read the preview line inside a context manager so the file handle is closed
# deterministically (the previous bare open(...).readline() never closed it).
with open(subset_output_file) as f:
    first_line = f.readline()

# An empty subset produces an empty file; skip the preview rather than
# letting json.loads fail on an empty string.
if first_line.strip():
    print("Sample record with UID:", json.loads(first_line))
else:
    print(f"Warning: {subset_output_file} is empty, no sample record to show")

Adding UIDs and saving subset...
Saved 15000 records to CLEVR_int_only_15K_v1_subset.jsonl
Sample record with UID: {'image': 'MMPR-v1.2-prompts/correctness_images/CLEVR_math_en_20240402_extracted/CLEVR_v1.0/images/train/CLEVR_train_029709.png', 'question': 'Subtract all cylinders. How many objects are left?\nAnswer the question using a single word or phrase.', 'answer': '3', 'uid': 'ab9dfdb6-8982-4c31-af96-7f08b52fbc0e'}


In [5]:
# Scratch check of the path mapping used when copying images: a record's
# "image" value starts with a leading directory component that is dropped,
# and the remainder is resolved under the local dataset root.
loaded_image_path = "MMPR-v1.2-prompts/correctness_images/CLEVR_math_en_20240402_extracted/CLEVR_v1.0/images/train/CLEVR_train_029709.png"
source_image_dir = "/data/users/brandon/ob1-projects/MMPR-v1.2-prompts"

# Everything after the first "/"-separated component is the path relative to the root.
path_parts = loaded_image_path.split("/")
source_image_path = "/".join(path_parts[1:])
full_source_image_path = os.path.join(source_image_dir, source_image_path)
print("full_source_image_path: ", full_source_image_path)

# The bare filename is the tail element of the (head, tail) split.
image_filename = os.path.split(loaded_image_path)[1]
print("image_filename: ", image_filename)

full_source_image_path:  /data/users/brandon/ob1-projects/MMPR-v1.2-prompts/correctness_images/CLEVR_math_en_20240402_extracted/CLEVR_v1.0/images/train/CLEVR_train_029709.png
image_filename:  CLEVR_train_029709.png


In [1]:
import os
import json
import shutil

# Copy every image referenced by the subset JSONL into a flat local directory.
subset_output_file = "CLEVR_int_only_15K_v1_subset.jsonl"
subset_images_dir = "subset_images"
# Root under which the source images live; loop-invariant, so defined once here.
source_image_dir = "/data/users/brandon/ob1-projects/MMPR-v1.2-prompts"
os.makedirs(subset_images_dir, exist_ok=True)

print(f"Copying images to {subset_images_dir} directory...")

# Separate tallies so the summary distinguishes files actually copied in this
# run from files that were already present and from failures.
copied_count = 0
skipped_count = 0
failed_count = 0

with open(subset_output_file, 'r') as f:
    for line in f:
        line = line.strip()
        # Ignore blank lines, which json.loads would reject.
        if not line:
            continue
        record = json.loads(line)

        # Drop the first path component of the recorded image path and
        # resolve the remainder under source_image_dir.
        source_image_path = "/".join(record["image"].split("/")[1:])
        full_source_image_path = os.path.join(source_image_dir, source_image_path)

        # NOTE(review): the destination is keyed by filename only, so two
        # records whose images share a filename in different directories would
        # collide and the second would be counted as skipped — confirm all
        # images come from a single directory.
        image_filename = os.path.basename(source_image_path)
        dest_image_path = os.path.join(subset_images_dir, image_filename)

        # Skip if destination already exists to avoid overwriting. Counted
        # separately (and not printed per file) so a re-run neither reports
        # these as fresh copies nor floods the output.
        if os.path.exists(dest_image_path):
            skipped_count += 1
            continue

        try:
            if os.path.exists(full_source_image_path):
                # copy2 copies the file contents and attempts to preserve metadata
                shutil.copy2(full_source_image_path, dest_image_path)
                copied_count += 1
                if copied_count % 1000 == 0:  # Progress update every 1000 files
                    print(f"Copied {copied_count} images so far...")
            else:
                print(f"Warning: Source image not found: {full_source_image_path}")
                failed_count += 1
        except Exception as e:
            # Best effort: record the failure and continue with the remaining images.
            print(f"Error copying {full_source_image_path}: {e}")
            failed_count += 1

print(f"Successfully copied {copied_count} images")
print(f"Skipped {skipped_count} images that already existed")
print(f"Failed to copy {failed_count} images")

Copying images to subset_images directory...
Copied 1000 images so far...
Copied 2000 images so far...
Copied 3000 images so far...
Copied 4000 images so far...
Copied 5000 images so far...
Copied 6000 images so far...
Copied 7000 images so far...
Copied 8000 images so far...
Copied 9000 images so far...
Copied 10000 images so far...
Copied 11000 images so far...
Copied 12000 images so far...
Copied 13000 images so far...
Copied 14000 images so far...
Copied 15000 images so far...
Successfully copied 15000 images
Failed to copy 0 images


In [2]:
# open subset_images directory and count the number of files
import os

subset_images_dir = "subset_images"

# Tally every entry directly inside the directory. os.listdir returns all
# entry names, so any sub-directories present are included in the count too.
directory_entries = os.listdir(subset_images_dir)
num_files = len(directory_entries)
print(f"Number of files in {subset_images_dir}: {num_files}")

Number of files in subset_images: 15000


In [3]:
import os
import json
import shutil

# Build the final JSONL: each subset record gains an "image_path" field holding
# the absolute location of its copy inside the subset_images directory.
subset_output_file = "CLEVR_int_only_15K_v1_subset.jsonl"
final_output_file = "CLEVR_run1_int_only.jsonl"
base_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/CLEVR-MATH"
absolute_subset_images_dir = os.path.join(base_dir, "subset_images")

print(f"Creating final JSONL with absolute image paths: {final_output_file}")

# The input is opened before the output, so a missing input does not leave an
# empty output file behind.
with open(subset_output_file, 'r') as input_f:
    with open(final_output_file, 'w') as output_f:
        for raw_line in input_f:
            record = json.loads(raw_line.strip())

            # Only the filename of the original path is kept; it is re-rooted
            # under the absolute subset_images directory.
            record["image_path"] = os.path.join(
                absolute_subset_images_dir,
                os.path.basename(record["image"]),
            )

            output_f.write(json.dumps(record) + "\n")

print(f"Created final output file: {final_output_file}")

# Preview the first record of the file that was just written
with open(final_output_file, 'r') as f:
    first_line = f.readline()
sample_record = json.loads(first_line)
print("Sample final record:")
print(json.dumps(sample_record, indent=2))

Creating final JSONL with absolute image paths: CLEVR_run1_int_only.jsonl
Created final output file: CLEVR_run1_int_only.jsonl
Sample final record:
{
  "image": "MMPR-v1.2-prompts/correctness_images/CLEVR_math_en_20240402_extracted/CLEVR_v1.0/images/train/CLEVR_train_029709.png",
  "question": "Subtract all cylinders. How many objects are left?\nAnswer the question using a single word or phrase.",
  "answer": "3",
  "uid": "ab9dfdb6-8982-4c31-af96-7f08b52fbc0e",
  "image_path": "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/CLEVR-MATH/subset_images/CLEVR_train_029709.png"
}
