In [4]:
import os
import json
import random

# Path to the source JSON file; it is handed to
# extract_image_paths_and_captions_by_class() later in this cell.
# NOTE(review): hard-coded absolute path — adjust when running in a
# different workspace layout.
json_file_path = '/workspaces/finetune/AFINAL/clip/clip_blip.json'

# Function to extract all image paths and captions from the JSON file
def extract_image_paths_and_captions_by_class(json_file_path):
    """Group every image listed in a caption JSON file by its class.

    The file is expected to hold a JSON array of records, each carrying the
    keys ``"class"``, ``"caption"`` and ``"image_paths"`` (an iterable of
    paths). Every path in ``"image_paths"`` becomes its own entry that shares
    the record's caption and class.

    Args:
        json_file_path: Path to the JSON file to read.

    Returns:
        dict: Maps each class name to a list of dicts with the keys
        ``"image_path"``, ``"caption"`` and ``"class"``, in file order. A
        class whose records list no images maps to an empty list. An empty
        dict is returned (after printing a notice) when the file does not
        exist.

    Raises:
        KeyError: If a record lacks one of the expected keys.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    class_image_data = {}
    if not os.path.exists(json_file_path):
        # Best-effort: report the problem and hand back an empty mapping.
        print(f"JSON file not found at: {json_file_path}")
        return class_image_data

    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for record in data:
        class_name = record["class"]
        caption = record["caption"]
        # setdefault registers the class even when the record has no images.
        entries = class_image_data.setdefault(class_name, [])
        entries.extend(
            {"image_path": image_path, "caption": caption, "class": class_name}
            for image_path in record["image_paths"]
        )
    return class_image_data

# Function to split image paths into train, val, and test sets
def split_image_data(class_image_data, train_ratio=0.6, val_ratio=0.2, seed=None):
    """Split each class's records into train, validation and test subsets.

    The split is stratified: every class is shuffled and cut on its own, and
    the per-class pieces are concatenated in the iteration order of
    ``class_image_data``. The train and validation sizes are truncated with
    ``int()``, so the test subset receives whatever remains of each class
    (including the rounding leftovers).

    Args:
        class_image_data: Mapping of class name to a list of records. The
            lists are not modified; a shuffled copy is used instead.
        train_ratio: Fraction of each class placed in the train subset.
        val_ratio: Fraction of each class placed in the validation subset.
        seed: Optional seed for a private random generator, giving a
            reproducible split. When ``None`` the module-level ``random``
            state is used, so a prior ``random.seed(...)`` still applies.

    Returns:
        tuple: ``(train_data, val_data, test_data)``, three lists of records.

    Raises:
        ValueError: If a ratio is outside ``[0, 1]`` or the two ratios sum
            to more than 1.
    """
    if (
        not 0 <= train_ratio <= 1
        or not 0 <= val_ratio <= 1
        or train_ratio + val_ratio > 1 + 1e-9  # tolerate float rounding
    ):
        raise ValueError(
            "train_ratio and val_ratio must each be in [0, 1] and sum to at most 1"
        )

    # The random module and random.Random instances both expose shuffle().
    rng = random if seed is None else random.Random(seed)

    train_data = []
    val_data = []
    test_data = []

    for image_data in class_image_data.values():
        # Shuffle a copy so the caller's lists keep their original order.
        shuffled = list(image_data)
        rng.shuffle(shuffled)

        total_images = len(shuffled)
        train_end = int(train_ratio * total_images)
        val_end = train_end + int(val_ratio * total_images)

        train_data.extend(shuffled[:train_end])
        val_data.extend(shuffled[train_end:val_end])
        # Everything left over, rounding remainders included, goes to test.
        test_data.extend(shuffled[val_end:])

    return train_data, val_data, test_data

# Seed the global RNG so that re-running this cell reproduces the same split
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Extract image paths and captions by class
class_image_data = extract_image_paths_and_captions_by_class(json_file_path)

# Split image paths and captions into train, val, and test sets
train_data, val_data, test_data = split_image_data(class_image_data)

# Ensure the output directory exists (relative to the current working directory)
output_dir = './output/'
os.makedirs(output_dir, exist_ok=True)

# Destination of each split
train_json_path = os.path.join(output_dir, 'train_data.json')
val_json_path = os.path.join(output_dir, 'val_data.json')
test_json_path = os.path.join(output_dir, 'test_data.json')

# Save the splits to new JSON files, one file per split
for split_path, split_records in (
    (train_json_path, train_data),
    (val_json_path, val_data),
    (test_json_path, test_data),
):
    with open(split_path, 'w') as f:
        json.dump(split_records, f, indent=4)

print("Data has been split and saved into train, val, and test JSON files with captions.")


Data has been split and saved into train, val, and test JSON files with captions.


In [5]:
import json

# Test split to inspect.
# NOTE(review): presumably the file written to ./output/ by the splitting
# cell — that only holds if the notebook runs from /workspaces/finetune/AFINAL/clip.
file_path = '/workspaces/finetune/AFINAL/clip/output/test_data.json'

# Read the whole file and parse it in one go
with open(file_path) as file:
    data = json.loads(file.read())

# The trailing bare expression shows the record count as the cell output
num_records = len(data)
num_records


1394

In [6]:
import json

# Validation split to inspect.
# NOTE(review): presumably the file written to ./output/ by the splitting
# cell — that only holds if the notebook runs from /workspaces/finetune/AFINAL/clip.
file_path = '/workspaces/finetune/AFINAL/clip/output/val_data.json'

# Read the whole file and parse it in one go
with open(file_path) as file:
    data = json.loads(file.read())

# The trailing bare expression shows the record count as the cell output
num_records = len(data)
num_records


1155

In [7]:
import json

# Train split to inspect.
# NOTE(review): presumably the file written to ./output/ by the splitting
# cell — that only holds if the notebook runs from /workspaces/finetune/AFINAL/clip.
file_path = '/workspaces/finetune/AFINAL/clip/output/train_data.json'

# Read the whole file and parse it in one go
with open(file_path) as file:
    data = json.loads(file.read())

# The trailing bare expression shows the record count as the cell output
num_records = len(data)
num_records


3589