In [3]:
import json
import random
import uuid
import os
import shutil
from pathlib import Path
import pandas as pd
from collections import defaultdict

print("Imports completed successfully")

Imports completed successfully


In [4]:
# Load the full JSONL file
input_file = "full_ai2d_train_12k_en_20240410_extracted.jsonl"
data = []

print(f"Loading data from {input_file}...")
with open(input_file, 'r') as f:
    for line in f:
        data.append(json.loads(line.strip()))

print(f"Loaded {len(data)} records")
print("Sample record:", data[0])

Loading data from full_ai2d_train_12k_en_20240410_extracted.jsonl...
Loaded 12413 records
Sample record: {'image': 'MMPR-v1.2-prompts/correctness_images/ai2d_train_12k_en_20240410_extracted/images/4781.png', 'question': 'How many flowers are shown?\n1\n3\n2\n4\nPlease answer the question based on the options mentioned before.', 'answer': '2'}


In [10]:
# Group by unique images to ensure no duplicates
unique_images = {}
for record in data:
    image_path = record["image"]
    if image_path not in unique_images:
        unique_images[image_path] = record

print(f"Found {len(unique_images)} unique images")

Found 3976 unique images


In [5]:
# Group by unique images to ensure no duplicates
unique_questions = {}
for record in data:
    question = record["question"]
    if question not in unique_questions:
        unique_questions[question] = record

print(f"Found {len(unique_questions)} unique questions")

Found 12409 unique questions


In [6]:
# Sample 15K unique records
sample_size = 12000
unique_records = list(unique_questions.values())

if len(unique_records) < sample_size:
    print(f"Warning: Only {len(unique_records)} unique images available, less than requested {sample_size}")
    sampled_data = unique_records
else:
    # Use random.sample for sampling without replacement
    random.seed(42)  # For reproducibility
    sampled_data = random.sample(unique_records, sample_size)

print(f"Selected {len(sampled_data)} records for subset")

Selected 12000 records for subset


In [7]:
# Add UUID to each record and save subset
subset_output_file = "ai2d_open_ans_12K_v1_subset.jsonl"

print("Adding UIDs and saving subset...")
with open(subset_output_file, 'w') as f:
    for record in sampled_data:
        # Add UID to the record
        record_with_uid = record.copy()
        record_with_uid["uid"] = str(uuid.uuid4())
        
        # Write to file
        f.write(json.dumps(record_with_uid) + "\n")

print(f"Saved {len(sampled_data)} records to {subset_output_file}")
print("Sample record with UID:", json.loads(open(subset_output_file).readline()))

Adding UIDs and saving subset...
Saved 12000 records to ai2d_open_ans_12K_v1_subset.jsonl
Sample record with UID: {'image': 'MMPR-v1.2-prompts/correctness_images/ai2d_train_12k_en_20240410_extracted/images/4561.png', 'question': 'What object is shown in the diagram above?\nOrange tree\nMint Plant\nNone of the above\nApple Tree\nPlease answer the question based on the options mentioned before.', 'answer': 'Mint Plant', 'uid': 'be7ce904-2513-4655-9653-d147db2280c7'}


In [8]:
source_image_dir = "/data/users/brandon/ob1-projects/MMPR-v1.2-prompts"
loaded_image_path = "MMPR-v1.2-prompts/correctness_images/ai2d_train_12k_en_20240410_extracted/images/4561.png"

source_image_path = "/".join(loaded_image_path.split("/")[1::])
full_source_image_path = os.path.join(source_image_dir, source_image_path)

print("full_source_image_path: ", full_source_image_path)
# print("loaded_image_path: ", loaded_image_path)


image_filename = os.path.basename(loaded_image_path)
print("image_filename: ", image_filename)

full_source_image_path:  /data/users/brandon/ob1-projects/MMPR-v1.2-prompts/correctness_images/ai2d_train_12k_en_20240410_extracted/images/4561.png
image_filename:  4561.png


In [9]:
import os
import json
import shutil

# Copy images to subset_images directory
subset_output_file = "ai2d_open_ans_12K_v1_subset.jsonl"
subset_images_dir = "subset_images"
os.makedirs(subset_images_dir, exist_ok=True)

print(f"Copying images to {subset_images_dir} directory...")

# Read the subset file to get the image paths
copied_count = 0
failed_count = 0

with open(subset_output_file, 'r') as f:
    for line in f:
        record = json.loads(line.strip())
        source_image_path = "/".join(record["image"].split("/")[1::])

        source_image_dir = "/data/users/brandon/ob1-projects/MMPR-v1.2-prompts"

        full_source_image_path = os.path.join(source_image_dir, source_image_path)

        image_filename = os.path.basename(source_image_path)
        
        dest_image_path = os.path.join(subset_images_dir, image_filename)
        
        # Skip if destination already exists to avoid overwriting
        if os.path.exists(dest_image_path):
            print(f"Skipping {image_filename} - already exists")
            copied_count += 1
            continue
        
        try:
            if os.path.exists(full_source_image_path):
                shutil.copy2(full_source_image_path, dest_image_path)
                copied_count += 1
                if copied_count % 1000 == 0:  # Progress update every 1000 files
                    print(f"Copied {copied_count} images so far...")
            else:
                print(f"Warning: Source image not found: {full_source_image_path}")
                failed_count += 1
        except Exception as e:
            print(f"Error copying {full_source_image_path}: {e}")
            failed_count += 1

print(f"Successfully copied {copied_count} images")
print(f"Failed to copy {failed_count} images")

Copying images to subset_images directory...
Skipping 2098.png - already exists
Skipping 136.png - already exists
Skipping 1277.png - already exists
Skipping 493.png - already exists
Skipping 2585.png - already exists
Skipping 1986.png - already exists
Skipping 1775.png - already exists
Skipping 2248.png - already exists
Skipping 1769.png - already exists
Skipping 1295.png - already exists
Skipping 1776.png - already exists
Skipping 4845.png - already exists
Skipping 1989.png - already exists
Skipping 809.png - already exists
Skipping 494.png - already exists
Skipping 799.png - already exists
Skipping 929.png - already exists
Skipping 482.png - already exists
Skipping 1769.png - already exists
Skipping 907.png - already exists
Skipping 515.png - already exists
Skipping 416.png - already exists
Skipping 1801.png - already exists
Skipping 488.png - already exists
Skipping 244.png - already exists
Skipping 1043.png - already exists
Skipping 1949.png - already exists
Skipping 1760.png - al

In [10]:
# open subset_images directory and count the number of files
import os

subset_images_dir = "subset_images"

# Count the number of files in the directory
num_files = len(os.listdir(subset_images_dir))
print(f"Number of files in {subset_images_dir}: {num_files}")

Number of files in subset_images: 3230


In [11]:
import os
import json
import shutil

# Copy images to subset_images directory
subset_output_file = "ai2d_open_ans_12K_v1_subset.jsonl"
# Create final JSONL with absolute image paths
final_output_file = "ai2d_run1_open_ans_12K_v1_subset.jsonl"
base_dir = "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/AI2D"
absolute_subset_images_dir = os.path.join(base_dir, "subset_images")

print(f"Creating final JSONL with absolute image paths: {final_output_file}")

with open(subset_output_file, 'r') as input_f, open(final_output_file, 'w') as output_f:
    for line in input_f:
        record = json.loads(line.strip())
        
        # Extract original filename
        image_filename = os.path.basename(record["image"])
        
        # Create absolute path to the copied image
        absolute_image_path = os.path.join(absolute_subset_images_dir, image_filename)
        
        # Add the image_path field
        record["image_path"] = absolute_image_path
        
        # Write the updated record
        output_f.write(json.dumps(record) + "\n")

print(f"Created final output file: {final_output_file}")

# Show sample of final output
with open(final_output_file, 'r') as f:
    sample_record = json.loads(f.readline())
    print("Sample final record:")
    print(json.dumps(sample_record, indent=2))

Creating final JSONL with absolute image paths: ai2d_run1_open_ans_12K_v1_subset.jsonl
Created final output file: ai2d_run1_open_ans_12K_v1_subset.jsonl
Sample final record:
{
  "image": "MMPR-v1.2-prompts/correctness_images/ai2d_train_12k_en_20240410_extracted/images/4561.png",
  "question": "What object is shown in the diagram above?\nOrange tree\nMint Plant\nNone of the above\nApple Tree\nPlease answer the question based on the options mentioned before.",
  "answer": "Mint Plant",
  "uid": "be7ce904-2513-4655-9653-d147db2280c7",
  "image_path": "/data/users/brandon/ob1-projects/InternVL/internvl_chat/rollout_generation/preprocessed_prompts/preprocessing_scripts/AI2D/subset_images/4561.png"
}
