In [1]:
import csv
import os
import random
import shutil


def extract_info_from_csv(main_folder, phase, output_dir, num_images=None, shuffle=True):
    """
    Read one phase's caption CSV, optionally sub-sample it, and optionally copy
    the selected images into a new dataset directory.

    The CSV is read from ``<main_folder>/<phase>/radiology<phase>data.csv`` and
    every non-blank row must hold exactly three columns: image id, image path
    and caption. Line breaks inside a caption are removed.

    When ``output_dir`` is truthy, each selected image is looked up under
    ``<main_folder>/<phase>/radiology/images`` and then under
    ``<main_folder>/<phase>/non-radiology/images``. The first match is copied to
    the same sub-path below ``output_dir``, the entry's ``image_path`` is
    rewritten to the copy, and ``"<id>\\t<caption>"`` is appended to that
    category's ``captions.txt``. Images found in neither place are reported on
    stdout and skipped; their entries stay in the returned list with the
    original ``image_path``.

    :param main_folder: Root folder of the source dataset
    :param phase: Sub-folder / split name (e.g. 'train', 'validation', 'test')
    :param output_dir: Destination root for the copied images; falsy to skip copying
    :param num_images: Number of entries to keep, if None keeps all
    :param shuffle: Shuffle the entries before truncating to ``num_images``
    :return: List of single-key dictionaries ``{img_id: {'image_path': ..., 'caption': ...}}``
    :raises ValueError: If a non-blank CSV row does not have exactly three columns
    """

    data = []
    csv_path = os.path.join(main_folder, phase, f'radiology{phase}data.csv')
    # newline='' hands raw line endings to the csv module, as its documentation
    # requires for quoted fields that contain embedded line breaks.
    with open(csv_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            if not row:
                # Blank lines come back as empty lists; they carry no record.
                continue
            if len(row) != 3:
                raise ValueError(
                    f"{csv_path}: line {reader.line_num}: expected 3 columns "
                    f"(id, image path, caption), got {len(row)}"
                )
            img_id, img_path, caption = row
            data.append({
                img_id: {'image_path': img_path,
                         # With newline='' an embedded break may arrive as "\r\n",
                         # so strip both characters to keep one caption per line.
                         'caption': caption.replace('\r', '').replace('\n', '')}
            })
    if shuffle:
        random.shuffle(data)
    if num_images is not None:
        data = data[:num_images]

    if not output_dir:
        return data

    # generate a directory and copy the dataset
    os.makedirs(output_dir, exist_ok=True)
    for entry in data:
        # Each entry is a single-key dict: {img_id: details}
        img_id, details = next(iter(entry.items()))
        rel_path = details['image_path']

        # Try the radiology folder first, then fall back to non-radiology
        for rad in ('radiology', 'non-radiology'):
            old_img_path = os.path.join(main_folder, phase, rad, 'images', rel_path)
            if os.path.isfile(old_img_path):
                break
        else:
            print(f"NOTFOUND skipping image {old_img_path}")
            continue

        # Copy the image
        new_img_path = os.path.join(output_dir, phase, rad, 'images', os.path.basename(rel_path))
        os.makedirs(os.path.dirname(new_img_path), exist_ok=True)
        shutil.copy(old_img_path, new_img_path)

        # Update the image path in the dictionary to the new path
        details['image_path'] = new_img_path

        # Append the caption to the category's captions.txt file
        with open(os.path.join(output_dir, phase, rad, 'captions.txt'), 'a', encoding='utf-8') as out_file:
            out_file.write(f"{img_id}\t{details['caption']}\n")
    return data

In [2]:
import json


def generate_dataset(main_folder, output_json_path, test_count, train_count, validation_count, output_dir, all_images=False):
    """
    Select images for the train, validation and test phases and write their
    details to a single JSON file.

    For each phase this calls ``extract_info_from_csv``, which reads the phase's
    CSV under ``main_folder`` and, when ``output_dir`` is set, copies the
    selected images there. The per-phase results are stored under the keys
    'train', 'validation' and 'test'.

    :param main_folder: Root folder of the source dataset
    :param output_json_path: Path of the JSON file to write
    :param test_count: Number of images to select from the test phase
    :param train_count: Number of images to select from the train phase
    :param validation_count: Number of images to select from the validation phase
    :param output_dir: Destination root for the copied images
    :param all_images: If True, ignore the three counts and select every image
    """

    if all_images:
        # A count of None tells extract_info_from_csv to keep every entry.
        test_count = train_count = validation_count = None
    image_counts = {
        'train': train_count,
        'validation': validation_count,
        'test': test_count,
    }
    dataset_info = {
        phase: extract_info_from_csv(main_folder, phase, num_images=count, output_dir=output_dir)
        for phase, count in image_counts.items()
    }

    # The JSON may live outside output_dir (or output_dir may be unset), so make
    # sure its parent folder exists before writing.
    json_dir = os.path.dirname(output_json_path)
    if json_dir:
        os.makedirs(json_dir, exist_ok=True)
    with open(output_json_path, 'w', encoding='utf-8') as out_file:
        json.dump(dataset_info, out_file, indent=4)

In [3]:
# Run configuration for building the reduced dataset.
main_folder = 'ROCO'  # Path to the main dataset folder
output_json = 'selected_dataset/selected_dataset_info.json'  # Path to save the generated JSON file
output_dir = 'selected_dataset'  # Path to save the dataset
test = 50  # Number of images to select from test
train = 2000  # Number of images to select from train
valid = 500  # Number of images to select from validation
# Named select_all rather than `all` so the built-in all() is not shadowed
# for the rest of the kernel session.
select_all = False  # Flag to select all images
generate_dataset(main_folder, output_json, test, train, valid, output_dir, all_images=select_all)

NOTFOUND skipping image ROCO\train\non-radiology\images\PMC4345544_yjbm_88_1_93_g05.jpg
NOTFOUND skipping image ROCO\train\non-radiology\images\PMC4156025_f1000research-3-3454-g0000.jpg
NOTFOUND skipping image ROCO\validation\non-radiology\images\PMC3277920_PHLEB-10-100-g2.jpg
