In [None]:
# split the data into train, val and test folders
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# Read the label CSV and shuffle the rows. The shuffle is seeded: which rows
# land in each split depends on the row order handed to train_test_split, so
# an unseeded shuffle here would make the splits differ on every run even
# though train_test_split itself is given a fixed random_state.
SPLIT_SEED = 42
df = (
    pd.read_csv('/mnt/g/Datasets/Body_Parts_XRay/train_df.csv')
    .sample(frac=1, random_state=SPLIT_SEED)
    .reset_index(drop=True)
)
# Add a 'filename' column holding the base name of each image path
df['filename'] = df['image_path'].map(os.path.basename)
# Persist the shuffled frame (with the new column) for later cells
df.to_csv('/mnt/g/Datasets/Body_Parts_XRay/train_df_new.csv', index=False)

source_folder_path = '/mnt/g/Datasets/Body_Parts_XRay/images/train/'
dest_folder_path = '/mnt/g/Datasets/Body_Parts_XRay/Original/'

# 80% train; the remaining 20% is halved into 10% val and 10% test
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SPLIT_SEED)

# Define a function to copy files
def copy_files(df, dest_folder, source_root=None, dest_root=None):
    """Copy every file listed in ``df['filename']`` into a split folder.

    Each file is copied from ``<source_root>/<filename>`` to
    ``<dest_root>/<dest_folder>/images/<filename>``.

    Args:
        df: DataFrame with a 'filename' column.
        dest_folder: Name of the split sub-folder, e.g. 'train'.
        source_root: Folder holding the source files. Defaults to the
            notebook-level ``source_folder_path``.
        dest_root: Root under which the split folder is created. Defaults to
            the notebook-level ``dest_folder_path``.

    Raises:
        FileNotFoundError: if a listed file is missing from the source folder.
    """
    if source_root is None:
        source_root = source_folder_path
    if dest_root is None:
        dest_root = dest_folder_path

    images_dir = os.path.join(dest_root, dest_folder, 'images')
    # shutil.copy does not create missing parent directories, so make the
    # target folder up front; exist_ok keeps re-runs of the cell working.
    os.makedirs(images_dir, exist_ok=True)

    # Only the filename column is needed, so iterate it directly instead of
    # materialising a Series per row with iterrows().
    for filename in df['filename']:
        shutil.copy(
            os.path.join(source_root, filename),
            os.path.join(images_dir, filename),
        )

# Populate each split folder from its matching DataFrame
for split_name, split_df in (('train', train_df), ('val', val_df), ('test', test_df)):
    copy_files(split_df, split_name)

In [5]:
from PIL import Image
import numpy as np
# Sanity check: load one copied training image and print its raw pixel array
source_folder = '/mnt/g/Datasets/Body_Parts_XRay/Original/train/images/1.2.826.0.1.3680043.8.498.10038426859954986240523417641213777162-c.png'
img = np.array(Image.open(source_folder))
print(img)

[[200 201 202 ...  27  27  27]
 [199 200 201 ...  26  27  27]
 [203 202 202 ...  26  27  27]
 ...
 [ 27  27  27 ...  17   0   1]
 [ 26  27  27 ...  35   0   0]
 [ 27  27  26 ...  50   8   0]]


In [6]:
# Remove 4 black regions at all 4 edges. Add black padding to make it square
from PIL import Image
import os

def _crop_dark_edges(img, threshold=5):
    """Crop away dark borders, probing only the centre row and centre column.

    From each edge the scan walks inwards along the middle row (left/right)
    or the middle column (top/bottom) and stops at the first pixel that is
    not dark. If a probe line is dark all the way across, that axis is left
    uncropped.
    """
    pixels = img.load()
    width, height = img.size
    mid_x, mid_y = width // 2, height // 2

    # Alpha is not brightness: leave it out so that an opaque black pixel
    # such as (0, 0, 0, 255) still counts as dark.
    colour_channels = [
        i for i, band in enumerate(img.getbands()) if band.upper() != "A"
    ]

    def is_dark(pixel):
        if isinstance(pixel, tuple):
            # Every colour channel must be below the threshold. A plain tuple
            # comparison like `pixel < (5, 5, 5)` is lexicographic and would
            # call (4, 255, 255) dark while calling (5, 0, 0) bright.
            return all(pixel[i] < threshold for i in colour_channels)
        # Single-band images yield a bare number (int or float).
        return pixel < threshold

    left = next(
        (x for x in range(width) if not is_dark(pixels[x, mid_y])), 0
    )
    right = next(
        (x + 1 for x in range(width - 1, -1, -1) if not is_dark(pixels[x, mid_y])),
        width,
    )
    top = next(
        (y for y in range(height) if not is_dark(pixels[mid_x, y])), 0
    )
    bottom = next(
        (y + 1 for y in range(height - 1, -1, -1) if not is_dark(pixels[mid_x, y])),
        height,
    )

    return img.crop((left, top, right, bottom))


def _pad_to_square(img):
    """Centre ``img`` on a black square canvas sized to its longer edge.

    The canvas is always created in RGB mode, so a padded image comes out as
    RGB whatever the mode of the input.
    """
    width, height = img.size
    side = max(width, height)

    canvas = Image.new("RGB", (side, side), color=(0, 0, 0))
    canvas.paste(img, ((side - width) // 2, (side - height) // 2))
    return canvas


def remove_dark_edges(source_folder, destination_folder, threshold=5):
    """Crop dark borders from every PNG in a folder and pad the result square.

    For each file in ``source_folder`` whose name ends with '.png', the dark
    borders are cropped, the image is padded with black to a square if its
    sides differ, and the result is saved under the same file name in
    ``destination_folder`` (created if it does not exist). Other files are
    ignored.

    Args:
        source_folder: Folder containing the input PNG files.
        destination_folder: Folder that receives the processed files.
        threshold: A pixel counts as dark when all of its colour channels are
            below this value.
    """
    os.makedirs(destination_folder, exist_ok=True)

    for image_file in os.listdir(source_folder):
        if not image_file.endswith('.png'):
            continue

        # The context manager releases the file handle even if processing
        # of this image raises.
        with Image.open(os.path.join(source_folder, image_file)) as img:
            processed = _crop_dark_edges(img, threshold)
            if processed.width != processed.height:
                processed = _pad_to_square(processed)
            processed.save(os.path.join(destination_folder, image_file))


# Process the train, val and test splits in turn: Original/<split> -> Square/<split>
for source_folder, destination_folder in (
    ('/mnt/g/Datasets/Body_Parts_XRay/Original/train/images',
     '/mnt/g/Datasets/Body_Parts_XRay/Square/train/images'),
    ('/mnt/g/Datasets/Body_Parts_XRay/Original/val/images',
     '/mnt/g/Datasets/Body_Parts_XRay/Square/val/images'),
    ('/mnt/g/Datasets/Body_Parts_XRay/Original/test/images',
     '/mnt/g/Datasets/Body_Parts_XRay/Square/test/images'),
):
    remove_dark_edges(source_folder, destination_folder)

In [None]:
# Peek at the first few PNG file names in the copied test split
source_folder = '/mnt/g/Datasets/Body_Parts_XRay/Original/test/images/'
image_files = []
for entry in os.listdir(source_folder):
    if entry.endswith('.png'):
        image_files.append(entry)
print(image_files[:5])

In [None]:
# https://github.com/mlfoundations/open_clip/issues/439


In [None]:
# generate captions.txt for image folder

import os
# Integer class index -> human-readable label. The index of each name in the
# list below is its class id (0 = "Abdomen" ... 21 = "Wrist").
tissue_map = dict(enumerate([
    "Abdomen",
    "Ankle",
    "Cervical Spine",
    "Chest",
    "Clavicles",
    "Elbow",
    "Feet",
    "Finger",
    "Forearm",
    "Hand",
    "Hip",
    "Knee",
    "Lower Leg",
    "Lumbar Spine",
    "Others",
    "Pelvis",
    "Shoulder",
    "Sinus",
    "Skull",
    "Thigh",
    "Thoracic Spine",
    "Wrist",
]))

def generate_caption(
    source_folder,
    output_file,
    labels_csv='/mnt/g/Datasets/Body_Parts_XRay/train_df_new.csv',
    labeled_csv_out='/mnt/g/Datasets/Body_Parts_XRay/train_df_new1.csv',
    label_map=None,
):
    """Write a tab-separated captions file for the PNG images in a folder.

    Each output line is ``<image name without extension>\\t<labels>``, where
    the labels are the names that the image's 'Target' value maps to, joined
    with ", ". Lines are written in sorted file-name order.

    Args:
        source_folder: Folder whose '.png' files get a caption line.
        output_file: Path of the captions text file to write.
        labels_csv: CSV with 'filename' and 'Target' columns; 'Target' holds
            one or more whitespace-separated integer class ids.
        labeled_csv_out: Where to save a copy of the CSV with the added
            'tissue_types' column. Pass None to skip writing it.
        label_map: Mapping from integer class id to label name. Defaults to
            the notebook-level ``tissue_map``.

    Raises:
        IndexError: if an image has no row in ``labels_csv``. Raised before
            the output file is opened, so no partial captions file is left.
        KeyError: if a class id is not present in ``label_map``.
    """
    if label_map is None:
        label_map = tissue_map

    # List all PNG images in the source folder; sorted because os.listdir
    # returns entries in arbitrary order.
    image_files = sorted(f for f in os.listdir(source_folder) if f.endswith('.png'))

    df = pd.read_csv(labels_csv)

    def map_to_tissue(target):
        # str() first: when every row holds a single id, read_csv infers an
        # integer column, and integers have no .split().
        return ", ".join(label_map[int(t)] for t in str(target).split())

    # Human-readable label names for every row
    df['tissue_types'] = df['Target'].apply(map_to_tissue)
    if labeled_csv_out is not None:
        df.to_csv(labeled_csv_out, index=False)

    # One dict lookup per image instead of a full boolean scan of the frame.
    # keep='first' matches the previous `.values[0]` choice on duplicates.
    captions = (
        df.drop_duplicates(subset='filename', keep='first')
        .set_index('filename')['tissue_types']
        .to_dict()
    )

    missing = [name for name in image_files if name not in captions]
    if missing:
        # IndexError is what the previous `.values[0]` lookup raised.
        raise IndexError(
            f"{len(missing)} image(s) in {source_folder} have no row in "
            f"{labels_csv}, e.g. {missing[:5]}"
        )

    with open(output_file, 'w') as f:
        for filename in image_files:
            base_name = os.path.splitext(filename)[0]
            f.write(f"{base_name}\t{captions[filename]}\n")


# Write captions.txt next to each split's images folder (test, val, then train)
for source_folder, output_file in (
    ('/mnt/g/Datasets/Body_Parts_XRay/Original/test/images/',
     '/mnt/g/Datasets/Body_Parts_XRay/Original/test/captions.txt'),
    ('/mnt/g/Datasets/Body_Parts_XRay/Original/val/images/',
     '/mnt/g/Datasets/Body_Parts_XRay/Original/val/captions.txt'),
    ('/mnt/g/Datasets/Body_Parts_XRay/Original/train/images/',
     '/mnt/g/Datasets/Body_Parts_XRay/Original/train/captions.txt'),
):
    generate_caption(source_folder, output_file)

In [None]:
# python generate_roco_csv.py --dataset_dir "/mnt/e/Temp_data/Body_Parts_XRay/Original/train" --input_text_name "captions.txt" --out_dir "../Body_Parts_XRay/" --out_csv_file "train.csv"
# python generate_roco_csv.py --dataset_dir "/mnt/e/Temp_data/Body_Parts_XRay/Original/val/" --input_text_name "captions.txt" --out_dir "../Body_Parts_XRay/" --out_csv_file "val.csv"
# python generate_roco_csv.py --dataset_dir "/mnt/e/Temp_data/Body_Parts_XRay/Original/test" --input_text_name "captions.txt" --out_dir "../Body_Parts_XRay/" --out_csv_file "test.csv"