In [1]:
import os
import shutil
import random
import csv
from itertools import cycle
from concurrent.futures import ThreadPoolExecutor
import numpy as np 
import pandas as pd
from collections import defaultdict


In [2]:
# Seed every generator the notebook draws from. The file split shuffles with the
# stdlib `random` module (random.shuffle), which np.random.seed() does not touch,
# so both generators must be seeded for a reproducible split.
np.random.seed(1234)
random.seed(1234)

In [24]:
def rename_filename(old_filename, datasets, new_dir_name):
    """
    Swap the dataset name embedded in a filename for the destination directory name.

    The datasets are tried in the order given; the first one that occurs anywhere in
    the filename is replaced (every occurrence of it). If none occurs, the filename
    is returned unchanged.

    :param old_filename (str): The original filename.
    :param datasets (list[str]): Candidate dataset names to look for.
    :param new_dir_name (str): Text that replaces the matched dataset name.
    :return (str): The renamed filename, or the original one when nothing matched.
    """
    matched = next((name for name in datasets if name in old_filename), None)
    if matched is None:
        return old_filename
    return old_filename.replace(matched, new_dir_name)


def create_new_directories(base_new_dir, class_structure):
    """
    Make sure the four fold directories for one class exist and return their paths.

    :param base_new_dir (str): Root directory under which the fold directories live.
    :param class_structure (str): Class-specific sub-path appended below each fold directory.
    :return: list[str]: The fold directory paths, in the order One, Two, Three, Four.
    """
    # Build every path first, then create them, so path errors surface before any mkdir.
    new_dirs = []
    for fold_name in ("One", "Two", "Three", "Four"):
        new_dirs.append(os.path.join(base_new_dir, fold_name, class_structure))

    for fold_dir in new_dirs:
        os.makedirs(fold_dir, exist_ok=True)

    return new_dirs


def collect_image_sources(base_directory, datasets, class_structure):
    """
    Collect all image paths from the source directories.

    For every dataset, the regular files directly inside
    ``<base_directory>/<dataset>/<class_structure>`` are gathered (sub-directories
    are skipped) and a one-line count is printed.

    :param base_directory (str): Base directory to search from.
    :param datasets (list[str]): List of dataset names.
    :param class_structure (str): Path structure for class-specific directories.
    :return: list[str]: All image source paths, grouped by dataset in the order given
        and sorted by filename within each dataset.
    """
    # Label for the progress message only: the first two path components (class, view).
    # Slicing instead of indexing keeps a one-component structure from raising IndexError.
    label = ", ".join(class_structure.split(os.path.sep)[:2])

    all_image_sources = []
    for dataset in datasets:
        dir_path = os.path.join(base_directory, dataset, class_structure)
        # os.listdir returns entries in arbitrary order; sort so that a seeded
        # shuffle downstream produces the same split on every run/machine.
        image_sources = [
            os.path.join(dir_path, img_name)
            for img_name in sorted(os.listdir(dir_path))
            if os.path.isfile(os.path.join(dir_path, img_name))
        ]
        all_image_sources.extend(image_sources)
        print(f"For {dataset}, {label}: {len(image_sources)} files.")
    return all_image_sources


def distribute_files_and_log(all_image_sources, datasets, new_dirs):
    """
    Distribute files across the new directories and log the changes.

    The sources are shuffled, then dealt out in contiguous runs so that every
    destination receives an equal share (the first ``len(sources) % len(new_dirs)``
    destinations receive one extra file). Each file is copied with its dataset name
    replaced by the fold name, and a CSV log of all copies is written.

    :param all_image_sources (list[str]): List of all image source paths (not modified).
    :param datasets (list[str]): List of dataset names.
    :param new_dirs (list[str]): List of destination directories; the i-th entry is
        treated as fold "One", "Two", "Three", "Four" by position.
    :raises ValueError: If ``new_dirs`` is empty or has more entries than fold names.
    """
    directory_names = ["One", "Two", "Three", "Four"]
    if not new_dirs:
        raise ValueError("new_dirs must contain at least one destination directory.")
    if len(new_dirs) > len(directory_names):
        raise ValueError(
            f"At most {len(directory_names)} destination directories are supported, got {len(new_dirs)}."
        )

    # Shuffle a copy so the caller's list is left untouched. The shuffle draws from
    # the stdlib global generator: call random.seed(...) beforehand for a
    # reproducible split (np.random.seed does not affect it).
    shuffled_sources = list(all_image_sources)
    random.shuffle(shuffled_sources)

    # Split by the number of destinations actually passed in rather than a fixed 4,
    # so no file is dropped when fewer directories are given.
    files_per_directory, leftover_files = divmod(len(shuffled_sources), len(new_dirs))

    logs = []
    index = 0
    for dir_num, new_dir in enumerate(new_dirs):
        limit = files_per_directory + (1 if dir_num < leftover_files else 0)

        for img_path in shuffled_sources[index:index + limit]:
            old_filename = os.path.basename(img_path)
            new_filename = rename_filename(old_filename, datasets, directory_names[dir_num])
            new_path = os.path.join(new_dir, new_filename)
            shutil.copy2(img_path, new_path)
            logs.append((img_path, os.path.dirname(img_path), new_dir))
        index += limit

        # Counts what is on disk, so files already present in new_dir are included.
        print(f"{new_dir}: {len(os.listdir(new_dir))} files.")

    # Logging
    # Two levels above the first destination is its fold directory (e.g. ".../One"),
    # so the relative path below it is the class structure (e.g. "Cardiomegaly/PA").
    fold_root = os.path.dirname(os.path.dirname(new_dirs[0]))
    csv_dir = os.path.join("/ssd2/pipeline/", "csv/file_log/split1")
    os.makedirs(csv_dir, exist_ok=True)
    class_structure = os.path.relpath(new_dirs[0], fold_root)
    log_file_path = os.path.join(csv_dir, f"file_log_{class_structure.replace(os.sep, '_')}.csv")
    with open(log_file_path, "w", newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Original File", "Original Directory", "New Directory"])
        writer.writerows(logs)


def random_split(base_directory, class_label):
    """
    Copy every image of one class from the source datasets into four fold
    directories, assigning images to folds at random and keeping the class
    sub-path below each fold.

    :param base_directory (str): Base directory containing the per-dataset folders.
    :param class_label (tuple[str, str]): Path components of the class, joined
        with os.path.join to form the class sub-path.
    """
    source_datasets = ["NIH", "CheXpert", "MIMIC", "PadChest"]
    output_root = "/ssd2/pipeline/datasets/620_file_size/updated/split1_random"
    class_subpath = os.path.join(*class_label)

    # Destination folders are created first, then the sources are listed and copied.
    fold_dirs = create_new_directories(output_root, class_subpath)
    distribute_files_and_log(
        collect_image_sources(base_directory, source_datasets, class_subpath),
        source_datasets,
        fold_dirs,
    )

In [4]:
# Source root: images are read from <base_directory>/<dataset>/<class>/<view>.
base_directory = "/ssd2/pipeline/datasets/620_file_size/split3_dataset"
# (class, view) pairs to split; each pair is processed independently below.
class_labels = [
    ("Cardiomegaly", "PA"),
    ("No-Finding", "PA"),
]

# Run random_split (the per-image random split) once for each class label.
for class_label in class_labels:
    random_split(base_directory, class_label)


For NIH, Cardiomegaly, PA: 620 files.
For CheXpert, Cardiomegaly, PA: 620 files.
For MIMIC, Cardiomegaly, PA: 620 files.
For PadChest, Cardiomegaly, PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/One/Cardiomegaly/PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/Two/Cardiomegaly/PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/Three/Cardiomegaly/PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/Four/Cardiomegaly/PA: 620 files.
For NIH, No-Finding, PA: 620 files.
For CheXpert, No-Finding, PA: 620 files.
For MIMIC, No-Finding, PA: 620 files.
For PadChest, No-Finding, PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/One/No-Finding/PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/Two/No-Finding/PA: 620 files.
/ssd2/pipeline/datasets/620_file_size/updated/split1_random/Three/No-Finding/PA: 620 files.
/ssd2/pipeline/datasets/620_file_size

In [31]:
def extract_patientinfo(filename):
    """
    Pull the patient ID, class and view out of an underscore-delimited filename.

    For "112559_MIMIC_Cardiomegaly_PA_19598446.png" this yields
    ("19598446", "Cardiomegaly", "PA"): the ID is the last underscore field up to
    its first dot, the class is the third field and the view is the fourth.

    :param filename: The filename from which to extract the patient info.
    :return: Tuple of (patient ID, class, view).
    """
    tokens = filename.split('_')
    patient_id, _, _ = tokens[-1].partition('.')
    patient_class = tokens[2]
    patient_view = tokens[3]
    return patient_id, patient_class, patient_view


def collect_image_sources(base_directory, datasets, class_structures):
    """
    Collect all image paths from the source directories.

    For every class structure and dataset, the regular files directly inside
    ``<base_directory>/<dataset>/<class_structure>`` are gathered (sub-directories
    are skipped) and a one-line count is printed.

    This definition replaces the earlier single-structure ``collect_image_sources``
    in the notebook namespace, so a bare string is also accepted and treated as a
    one-element list; otherwise a string would be iterated character by character.

    :param base_directory (str): Base directory to search from.
    :param datasets (list[str]): List of dataset names.
    :param class_structures (list[str] | str): Path structures for class-specific
        directories, or a single path structure.
    :return: list[str]: All image source paths, ordered by class structure, then
        dataset, then filename.
    """
    if isinstance(class_structures, str):
        class_structures = [class_structures]

    all_image_sources = []
    for class_structure in class_structures:
        # Label for the progress message only: the first two path components.
        # Slicing keeps a one-component structure from raising IndexError.
        label = ", ".join(class_structure.split(os.path.sep)[:2])
        for dataset in datasets:
            dir_path = os.path.join(base_directory, dataset, class_structure)
            # os.listdir order is arbitrary; sort so the downstream fold assignment,
            # which depends on input order, is the same on every run.
            image_sources = [
                os.path.join(dir_path, img_name)
                for img_name in sorted(os.listdir(dir_path))
                if os.path.isfile(os.path.join(dir_path, img_name))
            ]
            all_image_sources.extend(image_sources)
            print(f"For {dataset}, {label}: {len(image_sources)} files.")
    return all_image_sources

def create_new_directories(base_new_dir, class_structures):
    """
    Create the required new directories for each class structure within each fold.

    This definition replaces the earlier single-structure ``create_new_directories``
    in the notebook namespace, so a bare string is also accepted and treated as a
    one-element list; otherwise a string would be iterated character by character
    and one directory per character would be created.

    :param base_new_dir (str): The base directory where new directories should be created.
    :param class_structures (list[str] | str): Path structures for class-specific
        directories, or a single path structure.
    :return: list[str]: New directory paths, ordered fold by fold (One, Two, Three,
        Four) and, within a fold, in the order of ``class_structures``.
    """
    if isinstance(class_structures, str):
        class_structures = [class_structures]

    directory_names = ["One", "Two", "Three", "Four"]
    new_dirs = []
    for directory_name in directory_names:
        for class_structure in class_structures:
            new_dir = os.path.join(base_new_dir, directory_name, class_structure)
            os.makedirs(new_dir, exist_ok=True)
            new_dirs.append(new_dir)
    return new_dirs

def get_directory_for_image(base_new_dir, dest_dir_name, patient_class, patient_view):
    """
    Pick, from the pre-created directories, the one for a given fold, class and view.

    A directory matches when its trailing path components are exactly
    ``<dest_dir_name>/<patient_class>/<patient_view>``. Whole components are
    compared, so look-alikes such as ".../One/Cardiomegaly/PA_old" or
    ".../AnotherOne/Cardiomegaly/PA" are not mistaken for the requested directory.

    :param base_new_dir (list): List of pre-created directory paths.
    :param dest_dir_name (str): Base directory name like "One", "Two", etc.
    :param patient_class (str): The class of the patient (e.g., 'Cardiomegaly').
    :param patient_view (str): The view of the image (e.g., 'PA').
    :return: str: The first matching directory path, exactly as it appears in the list.
    :raises ValueError: If no directory in the list matches.
    """
    wanted = os.path.normpath(os.path.join(dest_dir_name, patient_class, patient_view))
    wanted_tail = os.sep + wanted
    for dir_path in base_new_dir:
        # Normalise so trailing separators or mixed separators do not defeat the match.
        normalized = os.path.normpath(dir_path)
        if normalized == wanted or normalized.endswith(wanted_tail):
            return dir_path
    raise ValueError("No matching directory found for the given base name, class, and view.")



def distribute_files_and_log_patient(all_image_sources, datasets, base_new_dir):
    """
    Copy images into the fold directories so that all images of one patient land in
    the same fold, then write one CSV log per (class, view) pair.

    Patients are taken in the order their first image appears in
    ``all_image_sources``; each patient is assigned to the fold that currently holds
    the fewest images (ties go to the earliest of One, Two, Three, Four).

    :param all_image_sources (list[str]): List of all image source paths.
    :param datasets (list[str]): List of dataset names (replaced by the fold name
        in the copied file's name).
    :param base_new_dir (list[str]): List of pre-created destination directories to
        choose from, one per fold/class/view combination.
    :raises ValueError: If no destination directory matches an image's fold, class and view.
    """
    # Group images by the patient ID parsed from the filename, keeping each image's
    # own class and view next to its path.
    images_by_patient = defaultdict(list)
    for img_path in all_image_sources:
        patient_id, class_label, view_label = extract_patientinfo(os.path.basename(img_path))
        images_by_patient[patient_id].append((img_path, class_label, view_label))

    directory_names = ["One", "Two", "Three", "Four"]
    dir_file_counts = {dir_name: 0 for dir_name in directory_names}
    # Keyed by (class, view) so the log file name is built from the labels of the
    # rows it contains, not from whichever image happened to be processed last.
    logs_by_class_view = defaultdict(list)

    for patient_id, patient_images in images_by_patient.items():
        # Greedy balancing: the whole patient goes to the currently smallest fold.
        selected_dir = min(dir_file_counts, key=dir_file_counts.get)
        dir_file_counts[selected_dir] += len(patient_images)

        for img_path, class_label, view_label in patient_images:
            # Determine the destination directory for each image based on its class
            dest_dir = get_directory_for_image(base_new_dir, selected_dir, class_label, view_label)
            os.makedirs(dest_dir, exist_ok=True)

            new_filename = rename_filename(os.path.basename(img_path), datasets, selected_dir)
            new_path = os.path.join(dest_dir, new_filename)
            shutil.copy2(img_path, new_path)
            # NOTE: the "New Directory" column receives the full destination file path.
            logs_by_class_view[(class_label, view_label)].append(
                (img_path, os.path.dirname(img_path), new_path)
            )

    # Logging separated by class and view
    csv_dir = os.path.join("/ssd2/pipeline/", "csv/file_log/split2")
    os.makedirs(csv_dir, exist_ok=True)
    for (class_label, view_label), logs in logs_by_class_view.items():
        log_file_path = os.path.join(csv_dir, f"file_log_{class_label}_{view_label}.csv")
        with open(log_file_path, "w", newline='') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(["Original File", "Original Directory", "New Directory"])
            writer.writerows(logs)


def random_split2(base_directory, class_labels):
    """
    Copy the images of the given classes from the source datasets into the
    split2_patient fold directories using the patient-level distribution.

    :param base_directory (str): Base directory containing the per-dataset folders.
    :param class_labels (list[tuple[str, str]]): Class labels; the first two items
        of each are joined into that class's sub-path.
    """
    source_datasets = ["NIH", "CheXpert", "MIMIC", "PadChest"]
    output_root = "/ssd2/pipeline/datasets/620_file_size/updated/split2_patient"

    # One sub-path per class label
    class_subpaths = []
    for label in class_labels:
        class_subpaths.append(os.path.join(label[0], label[1]))

    # Destination folders are created first, then the sources are listed and copied.
    fold_dirs = create_new_directories(output_root, class_subpaths)
    distribute_files_and_log_patient(
        collect_image_sources(base_directory, source_datasets, class_subpaths),
        source_datasets,
        fold_dirs,
    )

# Run the patient-level split for both classes in a single call.
# Note: this rebinds the notebook-level `class_labels` defined in an earlier cell
# (same values) and copies files on disk when executed.
class_labels = [
    ("Cardiomegaly", "PA"),
    ("No-Finding", "PA"),
]
random_split2("/ssd2/pipeline/datasets/620_file_size/split3_dataset", class_labels)


For NIH, Cardiomegaly, PA: 620 files.
For CheXpert, Cardiomegaly, PA: 620 files.
For MIMIC, Cardiomegaly, PA: 620 files.
For PadChest, Cardiomegaly, PA: 620 files.
For NIH, No-Finding, PA: 620 files.
For CheXpert, No-Finding, PA: 620 files.
For MIMIC, No-Finding, PA: 620 files.
For PadChest, No-Finding, PA: 620 files.


In [None]:
import pandas as pd

def _locate_split_and_fold(new_path, class_label):
    """Return (split name, fold name): the two path components that precede the class component in new_path."""
    parts = new_path.split('/')
    # Search from the right for a component that is exactly the class name; the
    # fold directory sits directly above it and the split directory above that.
    for position in range(len(parts) - 1, 1, -1):
        if parts[position] == class_label:
            return parts[position - 2], parts[position - 1]
    raise ValueError(f"Cannot find '<split>/<fold>/{class_label}' in destination path: {new_path}")


def load_csv_and_analyze(*file_paths, output_root='/ssd/averijordan/csv'):
    """
    Summarise the file-log CSVs of one split and write the summary tables to disk.

    The logs must have the columns "Original File" and "New Directory". Source files
    are expected at ``<base>/<dataset>/<class>/<view>/<file>`` and destinations at
    ``.../<split>/<fold>/<class>/...``; the split and fold are read from the
    destination path, so logs of any split can be analysed (not only split2_patient).

    :param file_paths (str): Paths of the log CSV files to load and concatenate.
    :param output_root (str): Directory under which ``<split>/`` is created to hold
        the summary CSVs.
    :return: dict: Total image count, unique patient count, number of patient IDs that
        occur more than once, and the paths of the written summary CSVs.
    :raises ValueError: If a destination path does not contain the class component,
        or if the logs belong to more than one split.
    """
    # Load and concatenate CSV files
    data_frames = [pd.read_csv(path) for path in file_paths]
    data = pd.concat(data_frames, ignore_index=True)

    # Information extraction
    # Patient ID: last underscore-separated field of the path, up to its first dot.
    data['ID'] = data['Original File'].apply(lambda x: x.split('_')[-1].split('.')[0])
    # Index from the end of the path so the depth of the base directory is irrelevant.
    data['Dataset'] = data['Original File'].apply(lambda x: x.split('/')[-4])
    data['Class'] = data['Original File'].apply(lambda x: x.split('/')[-3])
    located = [
        _locate_split_and_fold(new_path, class_label)
        for new_path, class_label in zip(data['New Directory'], data['Class'])
    ]
    data['New Folder Name'] = [fold for _, fold in located]
    # IDs are only unique within a dataset, so qualify them with the dataset name.
    data['New ID'] = data['ID'].astype(str) + "_" + data['Dataset'].astype(str)

    # All rows must belong to one split; its name selects the output sub-directory.
    split_names = sorted({split for split, _ in located})
    if len(split_names) > 1:
        raise ValueError(f"Logs from more than one split cannot be analysed together: {split_names}")
    split_name = split_names[0] if split_names else 'split2_patient'

    # Count total number of images
    total_images = data.shape[0]

    datasubset = data[['New ID', 'ID', 'Class', 'Dataset', 'New Folder Name']]
    # Count total number of Patient Counts
    total_unique_ids = data['New ID'].nunique()

    # Calculate the number of IDs that appear more than once
    id_counts = data['New ID'].value_counts()
    non_unique_ids = id_counts[id_counts > 1]
    count_of_non_unique_ids = non_unique_ids.count()

    # Count Patient Counts by various groupings
    ids_by_class = data.groupby(['Class'])['New ID'].nunique().reset_index(name='Patient Counts')
    ids_by_dataset = data.groupby(['Dataset'])['New ID'].nunique().reset_index(name='Patient Counts')
    ids_by_folder = data.groupby(['New Folder Name'])['New ID'].nunique().reset_index(name='Patient Counts')
    ids_by_class_dataset = data.groupby(['Class', 'Dataset'])['New ID'].nunique().reset_index(name='Patient Counts')
    ids_by_class_dataset_folder = data.groupby(['Class', 'Dataset', 'New Folder Name'])['New ID'].nunique().reset_index(name='Patient Counts')

    # Analysis for each dataset in each folder
    dataset_in_folder_analysis = data.groupby(['New Folder Name', 'Dataset', 'Class']).size().reset_index(name='Image Count')

    # Filter for 'Cardiomegaly' and 'No-Finding' and re-calculate the counts
    class_columns = ['Cardiomegaly', 'No-Finding']
    filtered_data = dataset_in_folder_analysis[dataset_in_folder_analysis['Class'].isin(class_columns)]

    # Create a pivot table for the new analysis
    dataset_and_class_in_folder_analysis = (
        filtered_data.pivot_table(
            index=['New Folder Name', 'Dataset'],
            columns='Class',
            values='Image Count',
            fill_value=0,  # Fill missing values with 0
            aggfunc='sum'  # Ensure correct aggregation
        )
        # Guarantee both class columns exist even if one class is absent from the
        # logs; otherwise the column assignment below fails on a length mismatch.
        .reindex(columns=class_columns, fill_value=0)
        .reset_index()
    )

    # Flatten the column hierarchy for the new analysis DataFrame
    dataset_and_class_in_folder_analysis.columns = ['New Folder Name', 'Dataset', 'Cardiomegaly', 'No-Finding']

    # Define output directory and ensure it exists (the trailing '' keeps the final
    # separator that the f-string paths below rely on).
    output_directory = os.path.join(output_root, split_name, '')
    os.makedirs(output_directory, exist_ok=True)

    class_data_folder = f'{output_directory}dataset_and_class_in_folder_analysis.csv'
    ids_by_class.to_csv(f'{output_directory}ids_by_class.csv', index=False)
    ids_by_dataset.to_csv(f'{output_directory}ids_by_dataset.csv', index=False)
    ids_by_folder.to_csv(f'{output_directory}ids_by_folder.csv', index=False)
    ids_by_class_dataset.to_csv(f'{output_directory}ids_by_class_dataset.csv', index=False)
    ids_by_class_dataset_folder.to_csv(f'{output_directory}ids_by_class_dataset_folder.csv', index=False)
    datasubset.to_csv(f'{output_directory}datasubset.csv', index=False)

    dataset_and_class_in_folder_analysis.to_csv(class_data_folder, index=False)

    # Return the summary statistics along with the paths of the written CSV files
    return {
        'Total Images': total_images,
        'Total Patient Counts': total_unique_ids,
        'Count of Non-Patient Counts': count_of_non_unique_ids,
        'New Dataset and Class in Folder Analysis CSV File Path': class_data_folder,
        'CSV File Path for IDs by Class': f'{output_directory}ids_by_class.csv',
        'CSV File Path for IDs by Dataset': f'{output_directory}ids_by_dataset.csv',
        'CSV File Path for IDs by Folder': f'{output_directory}ids_by_folder.csv',
        'CSV File Path for IDs by Class, Dataset, and Folder': f'{output_directory}ids_by_class_dataset_folder.csv'
    }

In [None]:
# Analyze the split2 file logs (the per-class CSVs under csv/file_log/split2 that
# distribute_files_and_log_patient writes) and print the headline numbers.
file_paths = [
'/ssd2/pipeline/csv/file_log/split2/file_log_Cardiomegaly_PA.csv',
'/ssd2/pipeline/csv/file_log/split2/file_log_No-Finding_PA.csv'
]
results = load_csv_and_analyze(*file_paths)
print("Overall Results:")
print("Total number of images:", results['Total Images'])
print("Total number of Patient Counts:", results['Total Patient Counts'])
print("Number of IDs that appear more than once:", results['Count of Non-Patient Counts'])



In [None]:
# Analyze the split1 file logs (the per-class CSVs under csv/file_log/split1 that
# distribute_files_and_log writes) and print the headline numbers.
file_paths = [
'/ssd2/pipeline/csv/file_log/split1/file_log_Cardiomegaly_PA.csv',
'/ssd2/pipeline/csv/file_log/split1/file_log_No-Finding_PA.csv'
]
results = load_csv_and_analyze(*file_paths)
print("Overall Results:")
print("Total number of images:", results['Total Images'])
# The result dict exposes these values under 'Total Patient Counts' and
# 'Count of Non-Patient Counts'; the previous key names raised KeyError.
print("Total number of unique IDs:", results['Total Patient Counts'])
print("Number of IDs that appear more than once:", results['Count of Non-Patient Counts'])
