In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os,sys
sys.path.insert(0,"..")
from glob import glob
import matplotlib as plt
import numpy as np
import pandas as pd
import torch
import torchvision
import torchxrayvision as xrv
import skimage
from torchvision.transforms.functional import to_pil_image
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
import ast

In [3]:
nih_path = '/ssd/averijordan/datasets/ChestXray-NIHCC'
chex_path = '/ssd/averijordan/datasets/CheXpert-v1.0-small'
pad_path = "/ssd2/averijordan/PADCHEST_SJ/image_zips/"
mimic_path = "/ssd2/jpmokc/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/files"

transform = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(), xrv.datasets.XRayResizer(224)])

In [4]:
# Load CheXpert Dataset
d_chexpert = xrv.datasets.CheX_Dataset(imgpath=chex_path,
                                   csvpath="/ssd/averijordan/csv/train.csv",
                                   views=["AP", "PA"], transform=transform,
                                   unique_patients=False
                                   )

# Load NIH Dataset
d_nih = xrv.datasets.NIH_Dataset(imgpath=nih_path,views=["AP", "PA"],transform=transform, unique_patients=False)

In [5]:
# Load PadChest Dataset
d_padchest = xrv.datasets.PC_Dataset(imgpath=pad_path, csvpath="/ssd2/averijordan/PADCHEST_SJ/labels_csv/PADCHEST_chest_x_ray_images_labels_160K_01.02.19.csv", 
                                views = ["PA", "AP"],
                                unique_patients=False)
#Load Mimic Dataset

d_mimic = xrv.datasets.MIMIC_Dataset(imgpath="/", csvpath="/ssd2/jpmokc/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv.gz",
                                      metacsvpath="/ssd2/jpmokc/datasets/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-metadata.csv.gz",
                                      views=['AP', 'PA'],
                                      unique_patients=False)


In [6]:
print(d_chexpert.csv.columns)
print(d_nih.csv.columns)
print(d_padchest.csv.columns)
print(d_mimic.csv.columns)

Index(['Path', 'Sex', 'Age', 'Frontal/Lateral', 'AP/PA', 'No Finding',
       'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
       'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia', 'Atelectasis',
       'Pneumothorax', 'Pleural Effusion', 'Pleural Other', 'Fracture',
       'Support Devices', 'view', 'patientid', 'age_years', 'sex_male',
       'sex_female'],
      dtype='object')
Index(['index', 'Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID',
       'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width',
       'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'view', 'has_masks',
       'patientid', 'age_years', 'sex_male', 'sex_female'],
      dtype='object')
Index(['Unnamed: 0', 'ImageID', 'ImageDir', 'StudyDate_DICOM', 'StudyID',
       'PatientID', 'PatientBirth', 'PatientSex_DICOM', 'ViewPosition_DICOM',
       'Projection', 'MethodProjection', 'Pediatric', 'Modality_DICOM',
       'Manufacturer_DICOM', 'PhotometricInterpretation_

In [7]:
#Create csv subset
d_chexpert_subset = d_chexpert.csv[['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
       'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
       'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
       'Fracture', 'Support Devices', 'patientid', 'Path', 'Frontal/Lateral', 'view']].copy()

#Add file directories and study number collumn
d_chexpert_subset['local_path'] = '/ssd/averijordan/datasets/' + d_chexpert_subset["Path"].astype(str)
d_chexpert_subset['study'] = d_chexpert_subset['Path'].str.extract(r'(study\d+)')


In [8]:
d_chexpert_subset = d_chexpert_subset[d_chexpert_subset['Frontal/Lateral'] != 'Lateral']

In [9]:
def create_filter(df, columns):
    return (df[columns] == 1).any(axis=1)

def create_exclusive_filter(df, columns):
    return (df[columns] == 1).sum(axis=1) == 1


columns_all = ['No Finding', 'Enlarged Cardiomediastinum', 'Cardiomegaly',
           'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation', 'Pneumonia',
           'Atelectasis', 'Pneumothorax', 'Pleural Effusion', 'Pleural Other',
           'Fracture', 'Support Devices']
columns_binary_class_subset = ['No Finding', 'Pleural Effusion']
binary_filter = create_filter(d_chexpert_subset, columns_binary_class_subset)


In [10]:
d_chexpert_subset_binary = d_chexpert_subset[binary_filter == 1]
print(d_chexpert_subset_binary)

        No Finding  Enlarged Cardiomediastinum  Cardiomegaly  Lung Opacity  \
0              1.0                         0.0           0.0           0.0   
5              1.0                         0.0           0.0           0.0   
7              1.0                         0.0           0.0           0.0   
11             1.0                         0.0           0.0           0.0   
14             NaN                         NaN           NaN           1.0   
...            ...                         ...           ...           ...   
223402         1.0                         0.0           0.0           0.0   
223403         NaN                         NaN           1.0           NaN   
223407         NaN                         NaN           NaN           NaN   
223409         NaN                         NaN           NaN          -1.0   
223413         1.0                         0.0           0.0           0.0   

        Lung Lesion  Edema  Consolidation  Pneumonia  Atelectas

In [11]:
d_chexpert_ap = d_chexpert_subset[d_chexpert_subset['view'] == 'AP']
d_chexpert_pa = d_chexpert_subset[d_chexpert_subset['view'] == 'PA']


In [12]:
d_chexpert_pneumonia_ap = d_chexpert_ap[d_chexpert_ap['Pneumonia'] == 1.0]
d_chexpert_cardiomegaly_ap = d_chexpert_ap[d_chexpert_ap['Cardiomegaly'] == 1.0]
d_chexpert_no_finding_ap = d_chexpert_ap[d_chexpert_ap['No Finding'] == 1.0]
d_chexpert_effusion_ap = d_chexpert_ap[d_chexpert_ap['Pleural Effusion'] == 1.0]

d_chexpert_pneumonia_pa = d_chexpert_pa[d_chexpert_pa['Pneumonia'] == 1.0]
d_chexpert_cardiomegaly_pa = d_chexpert_pa[d_chexpert_pa['Cardiomegaly'] == 1.0]
d_chexpert_no_finding_pa = d_chexpert_pa[d_chexpert_pa['No Finding'] == 1.0]
d_chexpert_effusion_pa = d_chexpert_pa[d_chexpert_pa['Pleural Effusion'] == 1.0]


In [13]:
d_chexpert_cardiomegaly_pa

Unnamed: 0,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,patientid,Path,Frontal/Lateral,view,local_path,study
16,,,1.0,,,0.0,,,1.0,,,,,,00009,CheXpert-v1.0-small/train/patient00009/study1/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study1
74,,,1.0,1.0,,0.0,,,1.0,,1.0,,,1.0,00023,CheXpert-v1.0-small/train/patient00023/study8/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study8
134,,,1.0,1.0,,1.0,,,-1.0,,,,,,00044,CheXpert-v1.0-small/train/patient00044/study7/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study7
136,,,1.0,,,1.0,,,1.0,,1.0,,,1.0,00044,CheXpert-v1.0-small/train/patient00044/study6/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study6
187,,,1.0,1.0,,,,,,0.0,1.0,,,1.0,00061,CheXpert-v1.0-small/train/patient00061/study2/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143901,,-1.0,1.0,,,,0.0,,,0.0,0.0,,,,34574,CheXpert-v1.0-small/train/patient34574/study5/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study5
143955,,1.0,1.0,1.0,,,0.0,,,,0.0,0.0,,1.0,34593,CheXpert-v1.0-small/train/patient34593/study1/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study1
178015,,,1.0,,,-1.0,,,,,0.0,,,1.0,41739,CheXpert-v1.0-small/train/patient41739/study1/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study1
188235,,,1.0,1.0,,-1.0,,,,,,-1.0,,,44899,CheXpert-v1.0-small/train/patient44899/study3/...,Frontal,PA,/ssd/averijordan/datasets/CheXpert-v1.0-small/...,study3


In [14]:
d_chexpert_cardiomegaly_ap_file_info = d_chexpert_cardiomegaly_ap[["patientid", "study"]].copy()
d_chexpert_cardiomegaly_pa_file_info = d_chexpert_cardiomegaly_pa[["patientid", "study"]].copy()
d_chexpert_pneumonia_ap_file_info = d_chexpert_pneumonia_ap[["patientid", "study"]].copy()
d_chexpert_pneumonia_pa_file_info = d_chexpert_pneumonia_pa[["patientid", "study"]].copy()
d_chexpert_effusion_ap_file_info = d_chexpert_effusion_ap[["patientid", "study"]].copy()
d_chexpert_effusion_pa_file_info = d_chexpert_effusion_pa[["patientid", "study"]].copy()
d_chexpert_no_finding_ap_file_info = d_chexpert_no_finding_ap[["patientid", "study"]].copy()
d_chexpert_no_finding_pa_file_info = d_chexpert_no_finding_pa[["patientid", "study"]].copy()


In [15]:
d_chexpert_cardiomegaly_ap_file_info.columns = ["unique_id", "study"]
d_chexpert_cardiomegaly_pa_file_info.columns = ["unique_id", "study"]
d_chexpert_pneumonia_ap_file_info.columns = ["unique_id", "study"]
d_chexpert_pneumonia_pa_file_info.columns = ["unique_id", "study"]
d_chexpert_effusion_ap_file_info.columns = ["unique_id", "study"]
d_chexpert_effusion_pa_file_info.columns = ["unique_id", "study"]
d_chexpert_no_finding_ap_file_info.columns = ["unique_id", "study"]
d_chexpert_no_finding_pa_file_info.columns = ["unique_id", "study"]

In [None]:
#Create csv subset
cols = ['Cardiomegaly','No Finding', 'Pleural Effusion',"Pneumonia",'ProcedureCodeSequence_CodeMeaning', 'subject_id', 'study_id', 'dicom_id', 'patientid']
d_mimic_sub = d_mimic.csv[cols].copy()

#Add file directories collumn
d_mimic_sub['file_dir'] = "p" + d_mimic_sub['subject_id'].astype(str).str[:2] + "/p" + d_mimic_sub['subject_id'].astype(str) + "/s" + d_mimic_sub['study_id'].astype(str) + "/"


In [None]:
allowed_ap_views = {'CHEST (PORTABLE AP)',  'CHEST (SINGLE VIEW)'}
allowed_pa_views = {'CHEST (PA AND LAT)'}


d_mimic_ap = d_mimic_sub[d_mimic_sub['ProcedureCodeSequence_CodeMeaning'].isin(allowed_ap_views)]
d_mimic_pa = d_mimic_sub[d_mimic_sub['ProcedureCodeSequence_CodeMeaning'].isin(allowed_pa_views)]

#Create filter for count of labels assignmed
exclusive_filter = d_mimic_sub[['Cardiomegaly','No Finding', 'Pleural Effusion', 'Pneumonia']].sum(axis=1)
binary_filter = d_mimic_sub[['Cardiomegaly','No Finding']].sum(axis=1)

#Apply filter so that only entries with one label are left
d_mimic_ap = d_mimic_ap[binary_filter == 1]
d_mimic_pa = d_mimic_pa[binary_filter == 1]




In [None]:
d_mimic_cardiomegaly_ap = d_mimic_ap[d_mimic_ap['Cardiomegaly'] == 1.0]
d_mimic_no_finding_ap  = d_mimic_ap[d_mimic_ap['No Finding'] == 1.0]

d_mimic_cardiomegaly_pa  = d_mimic_pa[d_mimic_pa['Cardiomegaly'] == 1.0]
d_mimic_no_finding_pa       = d_mimic_pa[d_mimic_pa['No Finding'] == 1.0]


In [None]:
d_mimic_cardiomegaly_ap_file_info  = d_mimic_cardiomegaly_ap[["patientid", "file_dir"]].copy()
d_mimic_cardiomegaly_pa_file_info  = d_mimic_cardiomegaly_pa[["patientid", "file_dir"]].copy()

d_mimic_no_finding_ap_file_info = d_mimic_no_finding_ap[["patientid", "file_dir"]].copy()
d_mimic_no_finding_pa_file_info = d_mimic_no_finding_pa[["patientid", "file_dir"]].copy()



In [None]:
d_mimic_cardiomegaly_ap_file_info.columns = ["unique_id", "file_dir"]
d_mimic_cardiomegaly_pa_file_info.columns = ["unique_id", "file_dir"]
d_mimic_no_finding_ap_file_info.columns = ["unique_id", "file_dir"]
d_mimic_no_finding_pa_file_info.columns = ["unique_id", "file_dir"]

In [16]:
total_finding = d_nih.totals()
pathologies = []
for p in total_finding:
    pathologies.append(p)
pathologies.append('No Finding')
pathologies

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Effusion',
 'Emphysema',
 'Fibrosis',
 'Hernia',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pleural_Thickening',
 'Pneumonia',
 'Pneumothorax',
 'No Finding']

In [17]:
# Seperate columns wanted
d_nih_subset = pd.concat([d_nih.csv["Finding Labels"],d_nih.csv["view"], d_nih.csv["patientid"], d_nih.csv["Image Index"], d_nih.csv["Follow-up #"]], axis=1)
d_nih_subset.rename(columns={'Image Index': 'Image_Index'}, inplace=True)

In [18]:
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm

def locate_file_in_directory(image_name, base_dir):
    """
    Traverses a directory to find a file with the given name.

    :param image_name (str): Name of the image to search for.
    :param base_dir (str): Root directory path to begin the search from.
    :return: str or None: Full path to the file if found, otherwise None.
    """

    for root, _, files in os.walk(base_dir):
        if image_name in files:
            return os.path.join(root, image_name)
    return None

def parallel_file_search(image_names, base_dir):
    """
    Parallelizes the search for multiple files in a directory.

    :param image_names (list[str]): List of image names to search for.
    :param base_dir (str): Root directory path to begin the search from.
    :return: dict: Dictionary mapping image names to their file paths.
    """

    with ProcessPoolExecutor() as executor:
        file_paths = list(tqdm(executor.map(locate_file_in_directory, image_names, [base_dir]*len(image_names)), total=len(image_names)))
    return dict(zip(image_names, file_paths))

# Specify the root directory to initiate the search
root_directory = "/ssd/averijordan/datasets/ChestXray-NIHCC/"

# Assuming d_nih_subset is a pandas DataFrame, extract the list of image names
image_names_list = d_nih_subset['Image_Index'].tolist()

# Check if 'local_path' column exists in the DataFrame, if not, find the paths and add them
if 'local_path' not in d_nih_subset.columns:
    image_to_path_mapping = parallel_file_search(image_names_list, root_directory)
    
    # Map the found paths to a new column in the DataFrame
    d_nih_subset['local_path'] = d_nih_subset['Image_Index'].map(image_to_path_mapping)

# Print the DataFrame (or any other operation)
print(d_nih_subset)


100%|██████████| 112120/112120 [03:01<00:00, 618.94it/s] 

                Finding Labels view patientid       Image_Index  Follow-up #  \
0                 Cardiomegaly   PA         1  00000001_000.png            0   
1       Cardiomegaly|Emphysema   PA         1  00000001_001.png            1   
2        Cardiomegaly|Effusion   PA         1  00000001_002.png            2   
3                   No Finding   PA         2  00000002_000.png            0   
4                       Hernia   PA         3  00000003_001.png            0   
...                        ...  ...       ...               ...          ...   
112115          Mass|Pneumonia   PA     30801  00030801_001.png            1   
112116              No Finding   PA     30802  00030802_000.png            0   
112117              No Finding   PA     30803  00030803_000.png            0   
112118              No Finding   PA     30804  00030804_000.png            0   
112119              No Finding   PA     30805  00030805_000.png            0   

                                       




In [19]:
#Seperate out the bundled inner arrays, thus spliting the finding labels 
d_nih_subset["Finding Labels"] = d_nih_subset["Finding Labels"].apply(lambda x: x.split('|') if isinstance(x, str) else x)

In [20]:
#Chop of the .png if needed
d_nih_subset['Image_Index'] = d_nih_subset['Image_Index'].apply(lambda x: x[:-4] if '.png' in x else x)

In [21]:
# Set the categories that you want to avoid intersections with
categories_to_avoid_multi = {'No Finding', 'Cardiomegaly', 'Effusion', 'Pneumonia'}
categories_to_avoid_binary = {'No Finding', 'Cardiomegaly'}
# Create a mask where each element is True if the 'Finding Labels' list has no intersection with categories_to_avoid, and False otherwise
mask_multi = d_nih_subset['Finding Labels'].apply(lambda x: len(set(x) & categories_to_avoid_multi) <= 1)
mask_binary = d_nih_subset['Finding Labels'].apply(lambda x: len(set(x) & categories_to_avoid_binary) <= 1)

# Use the mask to filter the DataFrame
d_nih_subset_combo = d_nih_subset[mask_multi]
d_nih_subset_binary = d_nih_subset[mask_binary]



In [22]:

# Only keep rows where the "Finding Labels" list contains one element
d_nih_subset_exclusive = d_nih_subset[d_nih_subset["Finding Labels"].apply(len) == 1]

# To make sure the resulting labels are strings and not single-item lists
d_nih_subset_exclusive["Finding Labels"] = d_nih_subset_exclusive["Finding Labels"].apply(lambda x: x[0])


In [23]:
# Create label subset for NIH
d_nih_pneumonia = d_nih_subset_binary[d_nih_subset_binary["Finding Labels"].apply(lambda x: ('Pneumonia' in x))] 
d_nih_cardiomegaly = d_nih_subset_binary[d_nih_subset_binary["Finding Labels"].apply(lambda x: ('Cardiomegaly' in x))]
d_nih_effusion = d_nih_subset_binary[d_nih_subset_binary["Finding Labels"].apply(lambda x: ('Effusion' in x))]
d_nih_no_finding = d_nih_subset_binary[d_nih_subset_binary["Finding Labels"].apply(lambda x : ('No Finding' in x))]

In [24]:
d_nih_pneumonia_ap = d_nih_pneumonia[d_nih_pneumonia['view'] == "AP"]
d_nih_pneumonia_pa = d_nih_pneumonia[d_nih_pneumonia['view'] == "PA"]
d_nih_cardiomegaly_ap = d_nih_cardiomegaly[d_nih_cardiomegaly['view'] == "AP"]
d_nih_cardiomegaly_pa = d_nih_cardiomegaly[d_nih_cardiomegaly['view'] == "PA"]
d_nih_effusion_ap = d_nih_effusion[d_nih_effusion['view'] == "AP"]
d_nih_effusion_pa = d_nih_effusion[d_nih_effusion['view'] == "PA"]
d_nih_no_finding_ap = d_nih_no_finding[d_nih_no_finding['view'] == "AP"]
d_nih_no_finding_pa = d_nih_no_finding[d_nih_no_finding['view'] == "PA"]

In [25]:
d_nih_pneumonia_ap_file_info = d_nih_pneumonia_ap[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_pneumonia_pa_file_info = d_nih_pneumonia_pa[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_cardiomegaly_ap_file_info = d_nih_cardiomegaly_ap[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_cardiomegaly_pa_file_info = d_nih_cardiomegaly_pa[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_effusion_ap_file_info = d_nih_effusion_ap[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_effusion_pa_file_info = d_nih_effusion_pa[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_no_finding_ap_file_info = d_nih_no_finding_ap[["Image_Index", "patientid", "Follow-up #"]].copy()
d_nih_no_finding_pa_file_info = d_nih_no_finding_pa[["Image_Index", "patientid", "Follow-up #"]].copy()

In [26]:
d_nih_pneumonia_ap_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_pneumonia_pa_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_cardiomegaly_ap_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_cardiomegaly_pa_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_effusion_ap_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_effusion_pa_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_no_finding_ap_file_info.columns = ["unique_id", "patientid", "Follow-up #"]
d_nih_no_finding_pa_file_info.columns = ["unique_id", "patientid", "Follow-up #"]

In [27]:
totals = d_padchest.totals()
pathologies = []
for p in totals:
    pathologies.append(p)

pathologies.append('Normal')

In [28]:
#Seperate Collumns we want
d_padchest_subset = pd.concat([d_padchest.csv["Labels"],d_padchest.csv["LabelsLocalizationsBySentence"], d_padchest.csv["patientid"], d_padchest.csv["StudyID"], d_padchest.csv["Projection"], d_padchest.csv["ImageID"], d_padchest.csv["ImageDir"]], axis=1)

#Seperate out the bundled inner arrays 
d_padchest_subset["Labels"] = d_padchest_subset["Labels"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

d_padchest_subset = d_padchest_subset[d_padchest_subset["Labels"].apply(lambda x: any(label.lower() in [s.lower() for s in x] for label in pathologies))]


In [29]:
# Specify the root directory to initiate the search
root_directory = "/ssd2/averijordan/PADCHEST_SJ/image_zips"

# Assuming d_nih_subset is a pandas DataFrame, extract the list of image names
image_names_list = d_padchest_subset['ImageID'].to_list()

# Check if 'local_path' column exists in the DataFrame, if not, find the paths and add them
if 'local_path' not in d_padchest_subset.columns:
    image_to_path_mapping = parallel_file_search(image_names_list, root_directory)
    
    # Map the found paths to a new column in the DataFrame
    d_padchest_subset['local_path'] = d_padchest_subset['ImageID'].map(image_to_path_mapping)

# Print the DataFrame (or any other operation)
print(d_padchest_subset['local_path'])

100%|██████████| 64179/64179 [03:23<00:00, 314.90it/s] 


0         /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
12        /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
14        /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
15        /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
17        /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
                                ...                        
160852    /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160856    /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160857    /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160859    /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160860    /ssd2/averijordan/PADCHEST_SJ/image_zips/image...
Name: local_path, Length: 64179, dtype: object


In [30]:
# Create label subset for pad_chest

# For cardiomegaly, ensure that 'normal' is not present
d_padchest_cardiomegaly= d_padchest_subset[d_padchest_subset["Labels"].apply(lambda x: ('cardiomegaly' in x) and ('normal' not in x))]

# For normal, ensure that 'cardiomegaly' is not present
d_padchest_no_finding = d_padchest_subset[d_padchest_subset["Labels"].apply(lambda x: ('normal' in x) and ('pleural effusion' not in x))]
d_padchest_no_finding["Labels"] = d_padchest_no_finding["Labels"].apply(lambda labels_list: ["No-Finding" if label == "normal" else label for label in labels_list])



In [31]:
d_padchest_cardiomegaly["Labels"] = d_padchest_cardiomegaly["Labels"].apply(lambda labels_list: [label.title() for label in labels_list])
d_padchest_cardiomegaly

Unnamed: 0,Labels,LabelsLocalizationsBySentence,patientid,StudyID,Projection,ImageID,ImageDir,local_path
51,"[Cardiomegaly, Superior Mediastinal Enlargement]","[['cardiomegaly', 'superior mediastinal enlarg...",295924120130301648307550461274041646613,128060267991587422580475423001969971705,PA,128060267991587422580475423001969971705_pjiogk...,0,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
79,"[Cardiomegaly, Hilar Congestion]","[['normal'], ['cardiomegaly', 'loc cardiac'], ...",12258663245396477058286432024466027208,62251368659414281768550499840080640229,PA,62251368659414281768550499840080640229_8jw2hb.png,0,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
105,"[Pulmonary Mass, Pacemaker, Cardiomegaly, Vasc...","[['exclude'], ['pulmonary mass', 'loc right', ...",198136049184807299831101261408154142510,30661727075761817007267292459310975718,PA,30661727075761817007267292459310975718_86nsuj.png,0,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
135,[Cardiomegaly],"[['cardiomegaly', 'loc cardiac'], ['normal'], ...",260637685341270195128895242179862224586,84545123491560549832532130982226930123,PA,84545123491560549832532130982226930123_vvayv1.png,0,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
138,"[Vertebral Degenerative Changes, Unchanged, Ca...","[['vertebral degenerative changes'], ['unchang...",59696987918748412958906826258824219748,284353054075043225622260270287627142906,PA,284353054075043225622260270287627142906_kduagg...,0,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
...,...,...,...,...,...,...,...,...
160677,"[Cardiomegaly, Costophrenic Angle Blunting, Pl...","['cardiomegaly', 'loc cardiac', 'costophrenic ...",288746830925472680586150065871445304509,216840111366964012819207061112010315122422240,PA,216840111366964012819207061112010315122422240_...,49,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160687,"[Aortic Elongation, Supra Aortic Elongation, ...","['aortic elongation', ' supra aortic elongatio...",128778274369799340981019723975607594174,216840111366964012768025509942010172082436382,PA,216840111366964012768025509942010172082436382_...,49,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160753,"[Cardiomegaly, Vertebral Degenerative Changes]","['cardiomegaly', 'loc cardiac', 'vertebral deg...",128724982146182828321751788135810514641,216840111366964012558082906712010091130731554,PA,216840111366964012558082906712010091130731554_...,49,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...
160790,"[Cardiomegaly, Aortic Button Enlargement, Tra...","['cardiomegaly', 'loc cardiac', 'aortic button...",85610966088905831476694917480079804249,216840111366964012768025509942010172084539745,PA,216840111366964012768025509942010172084539745_...,49,/ssd2/averijordan/PADCHEST_SJ/image_zips/image...


In [32]:
#Seperate out views

d_padchest_cardiomegaly_ap = d_padchest_cardiomegaly[d_padchest_cardiomegaly['Projection'] == "AP"]
d_padchest_cardiomegaly_pa = d_padchest_cardiomegaly[d_padchest_cardiomegaly['Projection'] == "PA"]

d_padchest_no_finding_ap = d_padchest_no_finding[d_padchest_no_finding['Projection'] == "AP"]
d_padchest_no_finding_pa = d_padchest_no_finding[d_padchest_no_finding['Projection'] == "PA"]

In [33]:

d_padchest_cardiomegaly_ap_file_info = d_padchest_cardiomegaly_ap[["patientid", "ImageID", "local_path"]].copy()
d_padchest_cardiomegaly_pa_file_info = d_padchest_cardiomegaly_pa[["patientid","ImageID", "local_path"]].copy()

d_padchest_no_finding_ap_file_info = d_padchest_no_finding_ap[["patientid", "ImageID", "local_path"]].copy()
d_padchest_no_finding_pa_file_info = d_padchest_no_finding_pa[["patientid", "ImageID", "local_path"]].copy()


In [34]:

d_padchest_cardiomegaly_ap_file_info.reset_index(drop=True, inplace = True)
d_padchest_cardiomegaly_pa_file_info.reset_index(drop=True, inplace = True)

d_padchest_no_finding_ap_file_info.reset_index(drop=True, inplace = True)
d_padchest_no_finding_pa_file_info.reset_index(drop=True, inplace = True)

In [35]:
def export_dataframe_to_csv(dataframe, dest_path, filename):
    """
    Saves a given dataframe as a CSV file in a subdirectory determined by the filename.

    :param dataframe (pandas.DataFrame): DataFrame to be exported.
    :param dest_path (str): Root directory path for saving the CSV.
    :param filename (str): Desired name for the CSV file (without the .csv extension).
    :return: str: Full path where the CSV file was saved.
    """

    # Determine the appropriate subdirectory based on the filename
    subdirectory = ""
    if "nih" in filename.lower():
        subdirectory = "NIH"
    elif "chexpert" in filename.lower():
        subdirectory = "CheXpert"
    elif "padchest" in filename.lower():
        subdirectory = "PadChest"
    elif "mimic" in filename.lower():
        subdirectory = "Mimic"

    # Create or ensure the existence of the destination directory
    full_dest_path = os.path.join(dest_path, subdirectory)
    os.makedirs(full_dest_path, exist_ok=True)

    # Construct the full file path for the CSV
    file_path = os.path.join(full_dest_path, f"{filename}.csv")

    # Reorder columns, ensuring 'index' and 'unique_id' are first
    cols_order = ['index', 'unique_id'] + [col for col in dataframe.columns if col not in ['index', 'unique_id']]
    dataframe = dataframe.reindex(columns=cols_order)

    # Save the DataFrame to the desired location
    dataframe.to_csv(file_path, index=False)

    return file_path


In [36]:
overall_path = '/ssd2/averijordan/datasets'

In [37]:
def map_to_subfolder(dataframe_name):
    """
    Map a lowercase dataframe name to a corresponding subfolder name.
    
    :param dataframe_name (string): Name of the dataframe in lowercase format.
    :return: Name of the subfolder (first letter capitalized). If the name is 
             not recognized, it returns "Other".
    """
    # Define mapping from lowercase dataframe name to capitalized subfolder name.
    name_mapping = {
        "pneumonia": "Pneumonia",
        "cardiomegaly": "Cardiomegaly",
        "no_finding": "No-Finding",
        "effusion": "Effusion",
    }
    
    # Search for dataframe name in the mapping and return the corresponding value.
    for key in name_mapping:
        if key in dataframe_name:
            return name_mapping[key]
    
    # Return 'Other' if the name is not recognized.
    return "Other"


In [38]:
from concurrent.futures import ThreadPoolExecutor
import logging

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def reshape_and_save_image(row, path, counter, dataset_name, subfolder_name, view_subfolder):
    """
    Reads, reshapes, and saves images based on information from a dataframe row.
    
    :param row (pandas.Series): A row containing image details.
    :param path (string): The root directory path where the original images are.
    :param counter (int): Unique counter for generating filename.
    :param dataset_name (string): The source dataset name.
    :param subfolder_name (string): Label-based subfolder name.
    :param view_subfolder (string): Viewpoint-based subfolder name (e.g., AP, PA).
    :return: counter (int): Incremented counter.
    """
    # us patient id (based on whatever that is for the dataset) 
    unique_id = getattr(row, 'patientid')
    
    new_filename = f"{counter}_{dataset_name}_{subfolder_name}_{view_subfolder}_{unique_id}.png"
    local_path = getattr(row, 'local_path', None)

    # Only proceed if local_path is not None
    if local_path is not None:
        source_file_path = os.path.join(path, local_path)
        # Check source file path validity
        if not os.path.isfile(source_file_path):
            raise FileNotFoundError(f"The source file {source_file_path} does not exist.")

        dest_dir_path = os.path.join(path, dataset_name, subfolder_name, view_subfolder)
        if not os.path.isdir(dest_dir_path):
            os.makedirs(dest_dir_path, exist_ok=True)

        dest_file_path = os.path.join(dest_dir_path, new_filename)

        # Image loading and processing
        # [Remember to import skimage and torch libraries at the top if not already imported]
        img = skimage.io.imread(source_file_path)
        img = img / img.max()  # Normalize to [0,1]

        # Handling various image formats and dimensions
        if len(img.shape) == 3:
            if img.shape[2] == 4:
                img = skimage.color.rgba2rgb(img)
            img = skimage.color.rgb2gray(img)
        elif len(img.shape) < 2:
            print("error, dimension lower than 2 for image")

        img = img[None, :, :]
        img = torch.from_numpy(img)
        pil_transform = torchvision.transforms.ToPILImage(mode='L')
        img = pil_transform(img)
        img.save(dest_file_path, format='PNG')

        if counter % 2000 == 0:
            print(f"Processed: {source_file_path} -> {dest_file_path}")

        return counter
    else:
        # Handle the case where local_path is None - log it, skip, or take other action
        print(f"Warning: 'local_path' attribute is None for row with patientid {unique_id}")

def process_and_relocate_images(dataframe, meta_dataframe, dataframe_label, dataset_label, base_path, start_counter=1):
    """
    Reads, processes, and saves images in a structured directory format.

    :param dataframe (pandas.DataFrame): DataFrame with image data.
    :param meta_dataframe (pandas.DataFrame): Additional metadata for images.
    :param dataframe_label (str): Label/name of the dataframe.
    :param dataset_label (str): Label/name of the dataset.
    :param base_path (str): Base path for saving processed images.
    :param start_counter (int, optional): Initial counter for naming. Defaults to 1.
    """
    # Define folder structures
    finding_label = map_to_subfolder(dataframe_label)
    dataset_folder = os.path.join(base_path, dataset_label, finding_label)
    os.makedirs(dataset_folder, exist_ok=True)

    viewpoint_label = "AP" if "ap" in dataframe_label else "PA"
    viewpoint_folder = os.path.join(dataset_folder, viewpoint_label)
    os.makedirs(viewpoint_folder, exist_ok=True)

    logging.info(f"Starting to process and relocate images for {dataset_label}")
    with ThreadPoolExecutor() as executor:
        filenames_counter = list(executor.map(reshape_and_save_image, dataframe.itertuples(), 
                                              [base_path]*len(dataframe),
                                              range(start_counter, start_counter+len(dataframe)),
                                              [dataset_label]*len(dataframe), [finding_label]*len(dataframe),
                                              [viewpoint_label]*len(dataframe)))

    meta_dataframe['index'] = filenames_counter

def handle_multiple_datasets(dataset_dict, overall_path):
    """
    Manages processing of multiple datasets sequentially.

    :param dataset_dict (dict): Dictionary with dataset details.
    :param overall_path (str): Base path for all datasets.
    """
    for dataset_name, dataset_data in dataset_dict.items():
        logging.info(f"Processing dataset: {dataset_name}")
        process_and_relocate_images(dataset_data['data'], dataset_data['file_info'], 
                                    dataset_name, dataset_data['dataset_name'], overall_path) 
        # Assuming export_dataframe_to_csv is a function to save meta_dataframe into CSV.
        export_dataframe_to_csv(dataset_data['file_info'], overall_path, dataset_name)


In [39]:
def sample_dataframes(dataframes_dict, sample_size=1000, random_state=25):
    """
    For each dataset in the provided dictionary, this function samples a subset of the data
    and corresponding file information based on the given sample size.
    
    :param dataframes_dict (dict): Dictionary containing datasets. Each dataset should have 
                                   keys 'data' and 'file_info' pointing to their respective 
                                   dataframes.
    :param sample_size (int, optional): Number of samples to extract from each dataframe. 
                                        Defaults to 1000.
    :param random_state (int, optional): Random seed for reproducibility. Defaults to 42.
    """
    
    for key, value in dataframes_dict.items():
        data = value['data']
        file_info = value['file_info']
        
        # Reset the indices for both dataframes to ensure consistent indexing
        data.reset_index(drop=True, inplace=True)
        file_info.reset_index(drop=True, inplace=True)
        
        # Determine the number of samples to extract, limited by the dataset's size
        n_samples = min(sample_size, len(data))
        
        # Obtain random samples from the data, get their indices
        sampled_indices = data.sample(n=n_samples, random_state=random_state).index
        
        # Use the sampled indices to extract corresponding rows from both data and file_info dataframes
        value['data'] = data.loc[sampled_indices].reset_index(drop=True)
        value['file_info'] = file_info.loc[sampled_indices].reset_index(drop=True)


In [40]:


# Create a dictionary of datasets. 
# Each key in the dictionary represents a unique dataset identifier that combines the source, 
# the medical condition, and the view (AP or PA).
# The value for each key is another dictionary containing:
# - data: the main dataset
# - file_info: additional information about files in the dataset
# - dataset_name: the name of the source from which the dataset originated 
# (e.g., CheXpert or NIH).
datasets = {
    "d_padchest_no_finding_pa": {
        "data": d_padchest_no_finding_pa,
        "file_info": d_padchest_no_finding_pa_file_info,
        "dataset_name": "PadChest",
    },
    "d_padchest_cardiomegaly_pa": {
        "data": d_padchest_cardiomegaly_pa,
        "file_info": d_padchest_cardiomegaly_pa_file_info,
        "dataset_name": "PadChest",
    },
    "d_chexpert_pneumonia_ap": {
        "data": d_chexpert_pneumonia_ap,
        "file_info": d_chexpert_pneumonia_ap_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_pneumonia_pa": {
        "data": d_chexpert_pneumonia_pa,
        "file_info": d_chexpert_pneumonia_pa_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_cardiomegaly_ap": {
        "data": d_chexpert_cardiomegaly_ap,
        "file_info": d_chexpert_cardiomegaly_ap_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_cardiomegaly_pa": {
        "data": d_chexpert_cardiomegaly_pa,
        "file_info": d_chexpert_cardiomegaly_pa_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_effusion_ap": {
        "data": d_chexpert_effusion_ap,
        "file_info": d_chexpert_effusion_ap_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_effusion_pa": {
        "data": d_chexpert_effusion_pa,
        "file_info": d_chexpert_effusion_pa_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_no_finding_ap": {
        "data": d_chexpert_no_finding_ap,
        "file_info": d_chexpert_no_finding_ap_file_info,
        "dataset_name": "CheXpert",
    },
    "d_chexpert_no_finding_pa": {
        "data": d_chexpert_no_finding_pa,
        "file_info": d_chexpert_no_finding_pa_file_info,
        "dataset_name": "CheXpert",
    },
    "d_nih_pneumonia_ap": {
        "data": d_nih_pneumonia_ap,
        "file_info": d_nih_pneumonia_ap_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_pneumonia_pa": {
        "data": d_nih_pneumonia_pa,
        "file_info": d_nih_pneumonia_pa_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_cardiomegaly_ap": {
        "data": d_nih_cardiomegaly_ap,
        "file_info": d_nih_cardiomegaly_ap_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_cardiomegaly_pa": {
        "data": d_nih_cardiomegaly_pa,
        "file_info": d_nih_cardiomegaly_pa_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_effusion_ap": {
        "data": d_nih_effusion_ap,
        "file_info": d_nih_effusion_ap_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_effusion_pa": {
        "data": d_nih_effusion_pa,
        "file_info": d_nih_effusion_pa_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_no_finding_ap": {
        "data": d_nih_no_finding_ap,
        "file_info": d_nih_no_finding_ap_file_info,
        "dataset_name": "NIH",
    },
    "d_nih_no_finding_pa": {
        "data": d_nih_no_finding_pa,
        "file_info": d_nih_no_finding_pa_file_info,
        "dataset_name": "NIH",
    },
        "d_padchest_no_finding_pa": {
        "data": d_padchest_no_finding_pa,
        "file_info": d_padchest_no_finding_pa_file_info,
        "dataset_name": "PadChest",
    },
    "d_padchest_cardiomegaly_pa": {
        "data": d_padchest_cardiomegaly_pa,
        "file_info": d_padchest_cardiomegaly_pa_file_info,
        "dataset_name": "PadChest",
    }
}



In [108]:
# Sample each dataframe in the datasets dictionary to the specified size
sample_dataframes(datasets, sample_size=1200, random_state=50)

# Filter the datasets dictionary to only include datasets related to 
# ardiomegaly in PA view or no findings in PA view.
cardiomegaly_no_finding_pa_dataset = {k: v for k, v in datasets.items()
                                      if k.endswith('_cardiomegaly_pa') 
                                      or k.endswith('_no_finding_pa')}

# Iterate through the filtered dataset and print the number of rows each dataset contains
for key, value in cardiomegaly_no_finding_pa_dataset.items():
    # Get the number of rows from the 'data' dataframe in the current dataset
    num_rows = value['data'].shape[0]
    print(f"The dataset {key} has {num_rows} rows")


The dataset d_padchest_no_finding_pa has 1200 rows
The dataset d_padchest_cardiomegaly_pa has 1200 rows


In [109]:
# Specify the paths to the directories
nih_dir = "/ssd2/averijordan/datasets/NIH"
chex_dir = "/ssd2/averijordan/datasets/CheXpert"

# Check if the directories do not exist
if not os.path.exists(nih_dir) or not os.path.exists(chex_dir):
    # Call process_datasets() if either directory does not exist
    handle_multiple_datasets(cardiomegaly_no_finding_pa_dataset, "/ssd2/averijordan/datasets/csv/file_info")


2024-03-07 09:51:56,018 - INFO - Processing dataset: d_padchest_no_finding_pa
2024-03-07 09:51:56,019 - INFO - Starting to process and relocate images for PadChest


TypeError: join() argument must be str, bytes, or os.PathLike object, not 'NoneType'