In [9]:
def image_file_generator(images_dir, metadata_df):
    """
    Lazily produce the on-disk path of every image named in the metadata.

    Each entry of the 'Image Index' column is joined onto ``images_dir``;
    only paths that point at an existing regular file are emitted, in the
    order in which they appear in the column.

    :param images_dir: Directory containing images
    :param metadata_df: DataFrame containing metadata with 'Image Index' column
    :yield: Full path to the image file
    """
    # Build candidate paths on demand, then keep only those that are real files
    candidate_paths = (
        os.path.join(images_dir, file_name)
        for file_name in metadata_df['Image Index']
    )
    yield from filter(os.path.isfile, candidate_paths)


In [10]:
# Resolve each metadata split to the list of image paths that exist on disk
train_image_files = [*image_file_generator(images_dir, train_metadata)]
test_image_files = [*image_file_generator(images_dir, test_metadata)]

In [11]:
import os
import shutil
from concurrent.futures import ThreadPoolExecutor, as_completed

def organize_images(metadata_df, source_dir, target_dir):
    """
    Copy the images listed in ``metadata_df`` into per-label subfolders.

    Every image named in the 'Image Index' column that exists in
    ``source_dir`` is copied to ``<target_dir>/Pneumothorax`` when its
    'Finding Labels' value contains the text 'Pneumothorax', and to
    ``<target_dir>/No_Finding`` otherwise. Copies run concurrently on a
    thread pool; a progress line is printed every 100 completed files and a
    summary of any failed copies is printed at the end.

    :param metadata_df: DataFrame with 'Image Index' and 'Finding Labels' columns
    :param source_dir: Directory holding the source images
    :param target_dir: Directory that receives the two label subfolders
    :return: None
    """
    # Map file name -> label. Zipping the two columns avoids building a
    # Series per row, which is what iterrows() does.
    label_dict = {
        os.path.basename(image_name): 'Pneumothorax' if 'Pneumothorax' in finding_labels else 'No_Finding'
        for image_name, finding_labels in zip(metadata_df['Image Index'], metadata_df['Finding Labels'])
    }

    # shutil.copy does not create missing parent directories, so make sure
    # both label folders exist before any worker starts copying.
    for label in ('Pneumothorax', 'No_Finding'):
        os.makedirs(os.path.join(target_dir, label), exist_ok=True)

    # Copy a single file into the folder that matches its label
    def copy_file(image_path):
        file_name = os.path.basename(image_path)
        dest_path = os.path.join(target_dir, label_dict[file_name], file_name)
        shutil.copy(image_path, dest_path)
        return image_path

    failures = []

    # Use ThreadPoolExecutor for multithreading
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        # Submit all copy tasks, remembering which path each future belongs to
        futures = {
            executor.submit(copy_file, image_path): image_path
            for image_path in image_file_generator(source_dir, metadata_df)
        }

        # Progress indicator
        total_files = len(futures)
        processed_files = 0
        for future in as_completed(futures):
            processed_files += 1
            # An exception raised inside a worker is stored on the future and
            # is lost unless it is inspected here.
            error = future.exception()
            if error is not None:
                failures.append((futures[future], error))
            if processed_files % 100 == 0:  # Update every 100 files, adjust this number as needed
                print(f"Processed {processed_files}/{total_files} files.")

    # Report failures instead of raising, so one bad file does not abort the run
    if failures:
        print(f"{len(failures)} of {total_files} files could not be copied:")
        for image_path, error in failures[:10]:
            print(f"  {image_path}: {error!r}")
        if len(failures) > 10:
            print(f"  ... and {len(failures) - 10} more.")

# Training split -> <dataset_folder_path>/data/train/<label>/
organize_images(
    train_metadata,
    images_dir,
    os.path.join(dataset_folder_path, 'data/train'),
)

# Test split is written out as the validation set -> <dataset_folder_path>/data/val/<label>/
organize_images(
    test_metadata,
    images_dir,
    os.path.join(dataset_folder_path, 'data/val'),
)


Processed 100/48791 files.
Processed 200/48791 files.
Processed 300/48791 files.
Processed 400/48791 files.
Processed 500/48791 files.
Processed 600/48791 files.
Processed 700/48791 files.
Processed 800/48791 files.
Processed 900/48791 files.
Processed 1000/48791 files.
Processed 1100/48791 files.
Processed 1200/48791 files.
Processed 1300/48791 files.
Processed 1400/48791 files.
Processed 1500/48791 files.
Processed 1600/48791 files.
Processed 1700/48791 files.
Processed 1800/48791 files.
Processed 1900/48791 files.
Processed 2000/48791 files.
Processed 2100/48791 files.
Processed 2200/48791 files.
Processed 2300/48791 files.
Processed 2400/48791 files.
Processed 2500/48791 files.
Processed 2600/48791 files.
Processed 2700/48791 files.
Processed 2800/48791 files.
Processed 2900/48791 files.
Processed 3000/48791 files.
Processed 3100/48791 files.
Processed 3200/48791 files.
Processed 3300/48791 files.
Processed 3400/48791 files.
Processed 3500/48791 files.
Processed 3600/48791 files.
P

Processed 28900/48791 files.
Processed 29000/48791 files.
Processed 29100/48791 files.
Processed 29200/48791 files.
Processed 29300/48791 files.
Processed 29400/48791 files.
Processed 29500/48791 files.
Processed 29600/48791 files.
Processed 29700/48791 files.
Processed 29800/48791 files.
Processed 29900/48791 files.
Processed 30000/48791 files.
Processed 30100/48791 files.
Processed 30200/48791 files.
Processed 30300/48791 files.
Processed 30400/48791 files.
Processed 30500/48791 files.
Processed 30600/48791 files.
Processed 30700/48791 files.
Processed 30800/48791 files.
Processed 30900/48791 files.
Processed 31000/48791 files.
Processed 31100/48791 files.
Processed 31200/48791 files.
Processed 31300/48791 files.
Processed 31400/48791 files.
Processed 31500/48791 files.
Processed 31600/48791 files.
Processed 31700/48791 files.
Processed 31800/48791 files.
Processed 31900/48791 files.
Processed 32000/48791 files.
Processed 32100/48791 files.
Processed 32200/48791 files.
Processed 3230