In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
from pathlib import Path
import os,sys
import math
sys.path.insert(0,"..")
from glob import glob
import matplotlib as plt
import shutil
import numpy as np
import pandas as pd
import pathlib
import torch
import torchvision
import torchxrayvision as xrv
import skimage
from torchvision.transforms.functional import to_pil_image

In [3]:
def convert_size(size_bytes):
    """Format a byte count as a human-readable string using 1024-based units.

    Args:
        size_bytes: Non-negative number of bytes (int or float).

    Returns:
        ``"0B"`` for zero; otherwise ``"<value> <unit>"`` where the value is
        rounded to two decimals, e.g. ``"1.5 KB"``. Sizes beyond the largest
        known unit are expressed in that unit (``"1024.0 YB"``).

    Raises:
        ValueError: If ``size_bytes`` is negative.
    """
    if size_bytes == 0:
        return "0B"
    if size_bytes < 0:
        raise ValueError(f"size_bytes must be non-negative, got {size_bytes!r}")
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    # Each unit step is 2**10, so the unit index is the position of the highest
    # set bit divided by 10. This is exact, unlike a floating-point logarithm.
    i = (int(size_bytes).bit_length() - 1) // 10
    # Clamp: fractional sizes (< 1 byte) stay in "B"; huge sizes stay in "YB".
    i = max(0, min(i, len(size_name) - 1))
    p = 1024 ** i
    s = round(size_bytes / p, 2)
    return f"{s} {size_name[i]}"

def get_dir_size(start_path = '.'):
    """Measure a directory tree.

    Walks ``start_path`` recursively. Every file name encountered is counted,
    but only entries that are not symbolic links contribute to the byte total.

    Returns:
        A ``(formatted_size, file_count)`` tuple, where ``formatted_size`` is
        the byte total rendered by ``convert_size``.
    """
    byte_count = 0
    file_count = 0
    for root, _subdirs, names in os.walk(start_path):
        file_count += len(names)
        for name in names:
            full_path = os.path.join(root, name)
            if os.path.islink(full_path):
                # Links are counted as files but add nothing to the size.
                continue
            byte_count += os.path.getsize(full_path)
    return convert_size(byte_count), file_count


In [4]:
# Report (size, file count) for each full dataset and its reduced counterpart.
for dataset_dir in (
    "/ssd/averijordan/datasets/CheXpert-v1.0-small/CheXpert",
    "/ssd/averijordan/datasets/CheXpert-v1.0-small/CheXpert_Reduced",
    "/ssd/averijordan/datasets/ChestXray-NIHCC/NIH",
    "/ssd/averijordan/datasets/ChestXray-NIHCC/NIH_Reduced",
):
    print(get_dir_size(dataset_dir))


('1.39 GB', 28569)
('218.84 MB', 28569)
('8.84 GB', 23197)
('486.01 MB', 23197)


In [5]:
%%bash
# Cross-check the Python size report with du's summarized, human-readable totals.
# Each directory gets its own du invocation.
for dataset_dir in \
    /ssd/averijordan/datasets/CheXpert-v1.0-small/CheXpert \
    /ssd/averijordan/datasets/CheXpert-v1.0-small/CheXpert_Reduced \
    /ssd/averijordan/datasets/ChestXray-NIHCC/NIH \
    /ssd/averijordan/datasets/ChestXray-NIHCC/NIH_Reduced
do
    du -sh "$dataset_dir"
done

1.5G	/ssd/averijordan/datasets/CheXpert-v1.0-small/CheXpert
277M	/ssd/averijordan/datasets/CheXpert-v1.0-small/CheXpert_Reduced
8.9G	/ssd/averijordan/datasets/ChestXray-NIHCC/NIH
534M	/ssd/averijordan/datasets/ChestXray-NIHCC/NIH_Reduced


In [6]:
# Shared preprocessing pipeline read by process_image: centre-crop, then resize
# with a target size of 224 (the processed images below print as (224, 224)).
transform = torchvision.transforms.Compose(
    [
        xrv.datasets.XRayCenterCrop(),
        xrv.datasets.XRayResizer(224),
    ]
)

In [7]:
from concurrent.futures import ThreadPoolExecutor
import skimage.io
import skimage.color

def print_image_shapes(folder_path):
    """Print the array shape of every ``.jpg`` / ``.png`` file under ``folder_path``.

    The directory tree is walked recursively; files with any other extension
    are ignored. Nothing is returned.
    """
    image_suffixes = ('.jpg', '.png')
    for root, _, names in os.walk(folder_path):
        for name in names:
            if not name.endswith(image_suffixes):
                continue
            pixels = skimage.io.imread(os.path.join(root, name))
            print(f"Image {name} shape: {pixels.shape}")

def process_image(source_file_path, image_folder, file_format):
    """Convert one image into a preprocessed single-channel file.

    The image is read with ``skimage.io.imread``, scaled by its maximum value,
    reduced to grayscale when it has a channel axis, given a leading channel
    axis, passed through the module-level ``transform`` (if it is not ``None``)
    and written with PIL in mode ``'L'``.

    Args:
        source_file_path: Path of the image to read.
        image_folder: Folder under which the ``<FORMAT>/`` output directory is
            created (e.g. ``<image_folder>/JPG``).
        file_format: Target format name such as ``'JPG'`` or ``'PNG'``; letter
            case is ignored.

    Raises:
        ValueError: If the loaded image has fewer than two dimensions.

    Note:
        Reads the global ``transform`` defined elsewhere in the notebook.
    """
    # Normalise the case once so 'jpg' and 'JPG' behave identically everywhere.
    format_name = file_format.upper()
    dest_dir_path = os.path.join(image_folder, format_name)
    os.makedirs(dest_dir_path, exist_ok=True)
    base_filename, _ = os.path.splitext(os.path.basename(source_file_path))
    dest_file_path = os.path.join(dest_dir_path, f"{base_filename}.{file_format.lower()}")

    img = skimage.io.imread(source_file_path)

    # Fail loudly here instead of printing and crashing later on the indexing.
    if img.ndim < 2:
        raise ValueError(
            f"error, dimension lower than 2 for image: {source_file_path} (shape {img.shape})"
        )

    # Scale by the maximum; an all-zero image would otherwise divide by zero
    # and turn every pixel into NaN.
    peak = img.max()
    img = img / peak if peak > 0 else img.astype(float)

    if img.ndim == 3:
        if img.shape[2] == 4:
            img = skimage.color.rgba2rgb(img)
        img = skimage.color.rgb2gray(img)

    # Add the leading channel axis before the transform is applied.
    img = img[None, :, :]

    if transform is not None:
        img = transform(img)

    img = torch.from_numpy(img)

    pil_transform = torchvision.transforms.ToPILImage(mode='L')
    img = pil_transform(img)

    # PIL names the format 'JPEG', not 'JPG'.
    save_format = 'JPEG' if format_name == 'JPG' else format_name
    img.save(dest_file_path, format=save_format)

def process_images_in_folder(image_folder):
    """Convert every top-level ``.jpg`` / ``.png`` image in ``image_folder``.

    Each source image is processed once per target format by ``process_image``,
    which writes into ``<image_folder>/JPG`` and ``<image_folder>/PNG``. The
    size of the source images and of both output folders is printed, followed
    by the shape of every generated image.

    Args:
        image_folder: Directory holding the source images. Sub-directories are
            not searched for sources.
    """
    # Only regular files directly inside the folder count as sources; the
    # extension check ignores letter case.
    image_paths = [
        path
        for path in (os.path.join(image_folder, name) for name in os.listdir(image_folder))
        if os.path.isfile(path) and path.lower().endswith(('.jpg', '.png'))
    ]

    # Measure just the source images. Walking the whole folder would also count
    # the JPG/PNG output directories left by a previous run, so the reported
    # "original" size would grow every time the cell is re-executed.
    original_size = convert_size(sum(os.path.getsize(path) for path in image_paths))
    print(f"Original size of folder {image_folder}: {original_size}")

    with ThreadPoolExecutor() as executor:
        for file_format in ('JPG', 'PNG'):
            # list() drains the iterator so worker exceptions are re-raised here.
            list(
                executor.map(
                    lambda path, fmt=file_format: process_image(path, image_folder, fmt),
                    image_paths,
                )
            )

    jpg_dir = os.path.join(image_folder, 'JPG')
    png_dir = os.path.join(image_folder, 'PNG')
    jpg_size, _ = get_dir_size(jpg_dir)
    png_size, _ = get_dir_size(png_dir)
    print(f"New JPG folder size: {jpg_size}, New PNG folder size: {png_size}")
    print_image_shapes(jpg_dir)
    print_image_shapes(png_dir)


In [8]:
# Sample folders used to compare JPG vs PNG output sizes for each dataset.
chexpert_test_image_path = '/ssd/averijordan/datasets/test_pictures/CheXpert'
nih_test_image_path = '/ssd/averijordan/datasets/test_pictures/NIH'

for test_image_path in (chexpert_test_image_path, nih_test_image_path):
    process_images_in_folder(test_image_path)

Original size of folder /ssd/averijordan/datasets/test_pictures/CheXpert: 263.21 KB
New JPG folder size: 39.67 KB, New PNG folder size: 155.49 KB
Image 4_CheXpert_cardiomegaly_AP_00271.jpg shape: (224, 224)
Image 1_CheXpert_cardiomegaly_AP_00007.jpg shape: (224, 224)
Image 3_CheXpert_cardiomegaly_AP_00200.jpg shape: (224, 224)
Image 5_CheXpert_cardiomegaly_AP_00279.jpg shape: (224, 224)
Image 2_CheXpert_cardiomegaly_AP_00153.jpg shape: (224, 224)
Image 1_CheXpert_cardiomegaly_AP_00007.png shape: (224, 224)
Image 2_CheXpert_cardiomegaly_AP_00153.png shape: (224, 224)
Image 3_CheXpert_cardiomegaly_AP_00200.png shape: (224, 224)
Image 5_CheXpert_cardiomegaly_AP_00279.png shape: (224, 224)
Image 4_CheXpert_cardiomegaly_AP_00271.png shape: (224, 224)
Original size of folder /ssd/averijordan/datasets/test_pictures/NIH: 1.81 MB
New JPG folder size: 26.1 KB, New PNG folder size: 103.0 KB
Image 5_NIH_cardiomegaly_AP_00003990_000.jpg shape: (224, 224)
Image 3_NIH_cardiomegaly_AP_00000740_000.jpg