In [1]:
import os 
import hashlib
from glob import glob
from PIL import Image
from tqdm import tqdm
import imagehash 
from imagehash import phash 
import shutil


In [2]:
# Base directory for every dataset path below: the working directory at the
# moment this cell runs.
PROJECT_ROOT = os.getcwd()

In [3]:
# Folder of training images; every external image is compared against it.
train_path = os.path.join(PROJECT_ROOT, 'train_folder')

In [4]:
# Folder of external images that are checked for overlap with the training set.
external_path = os.path.join(PROJECT_ROOT, 'external_folder')

In [5]:
def perceptual_image_hash(image_path):
    """Compute the perceptual hash (pHash) of one image file.

    The image is converted to grayscale ('L' mode) and resized to 224x224
    before being passed to ``imagehash.phash``.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The string form of the perceptual hash, or ``None`` when the file
        could not be opened or processed (the error is printed, not raised).
    """
    try:
        # Put the image returned by Image.open under the context manager so the
        # underlying file is always closed; wrapping only the converted copy
        # left the opened file to be released by garbage collection.
        with Image.open(image_path) as source_img:
            img = source_img.convert('L').resize((224, 224))
            return str(imagehash.phash(img))
    except Exception as e:
        # Best effort: report the unreadable file and let the caller skip it.
        print(f"Error with image {image_path}: {e}")
        return None

def get_image_hashes(image_folder):
    """Group the files found under ``image_folder`` by perceptual hash.

    Every path matching ``*.*`` at any depth below the folder is hashed with
    ``perceptual_image_hash``; paths whose hash comes back falsy (``None`` on
    failure) are skipped.

    Args:
        image_folder: Root folder to scan recursively.

    Returns:
        A dict mapping each hash string to the list of paths that produced it.
    """
    search_pattern = os.path.join(image_folder, "**", '*.*')
    paths_by_hash = {}
    for candidate in tqdm(glob(search_pattern, recursive = True)):
        digest = perceptual_image_hash(candidate)
        if not digest:
            continue
        # First occurrence creates the bucket; later ones join it.
        paths_by_hash.setdefault(digest, []).append(candidate)
    return paths_by_hash

# Aliases for the two folders compared in this cell.
train_folder, external_folder = train_path, external_path

# Build hash -> [paths] maps for both image collections.
train_hashes = get_image_hashes(train_folder)
external_hashes = get_image_hashes(external_folder)

# Hashes present in both collections identify external images that also
# appear in the training data.
train_hash_set = set(train_hashes)
external_hash_set = set(external_hashes)
overlap_hashes = train_hash_set & external_hash_set

# Every external file whose hash also occurs in the training set.
duplicate_external_path = [
    dup_path
    for shared_hash in overlap_hashes
    for dup_path in external_hashes[shared_hash]
]

print(f"Total duplicated image files in external set: {len(duplicate_external_path)}")




100%|██████████| 2008/2008 [00:06<00:00, 299.40it/s]
100%|██████████| 5000/5000 [00:14<00:00, 346.57it/s]

Total duplicated image files in external set: 1649





In [6]:
# "raw" folders are the inputs scanned for overlap with the training set;
# "data" folders receive copies of the images that are not duplicates.
# Tumor class: raw input folder and cleaned output folder.
external_raw_path_tumor = os.path.join(PROJECT_ROOT, 'brain tumor external raw/Tumor')
external_path_tumor = os.path.join(PROJECT_ROOT, 'brain tumor external data/Tumor')
# Healthy (no-tumor) class: raw input folder and cleaned output folder.
external_raw_path_no_tumor = os.path.join(PROJECT_ROOT, 'brain tumor external raw/Healthy')
external_path_no_tumor = os.path.join(PROJECT_ROOT, 'brain tumor external data/Healthy')

In [7]:
# Make sure both cleaned output folders exist; re-running the cell is harmless.
for cleaned_dir in (external_path_tumor, external_path_no_tumor):
    os.makedirs(cleaned_dir, exist_ok=True)

In [8]:
# check external image (tumor) against training images (tumor + no-tumor)
def perceptual_image_hash(image_path):
    """Compute the perceptual hash (pHash) of one image file.

    The image is converted to grayscale ('L' mode) and resized to 224x224
    before being passed to ``imagehash.phash``.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The string form of the perceptual hash, or ``None`` when the file
        could not be opened or processed (the error is printed, not raised).
    """
    try:
        # Put the image returned by Image.open under the context manager so the
        # underlying file is always closed; wrapping only the converted copy
        # left the opened file to be released by garbage collection.
        with Image.open(image_path) as source_img:
            img = source_img.convert('L').resize((224, 224))
            return str(imagehash.phash(img))
    except Exception as e:
        # Best effort: report the unreadable file and let the caller skip it.
        print(f"Error with image {image_path}: {e}")
        return None

def get_image_hashes(image_folder):
    """Group the files found under ``image_folder`` by perceptual hash.

    Every path matching ``*.*`` at any depth below the folder is hashed with
    ``perceptual_image_hash``; paths whose hash comes back falsy (``None`` on
    failure) are skipped.

    Args:
        image_folder: Root folder to scan recursively.

    Returns:
        A dict mapping each hash string to the list of paths that produced it.
    """
    search_pattern = os.path.join(image_folder, "**", '*.*')
    paths_by_hash = {}
    for candidate in tqdm(glob(search_pattern, recursive = True)):
        digest = perceptual_image_hash(candidate)
        if not digest:
            continue
        # First occurrence creates the bucket; later ones join it.
        paths_by_hash.setdefault(digest, []).append(candidate)
    return paths_by_hash

#train_folder = train_path
#external_folder_tumor = external_path_tumor

# Hash the training set and the raw external tumor images. Two files count as
# duplicates when their perceptual hashes are exactly equal.
train_hashes = get_image_hashes(train_path)
external_hashes_tumor = get_image_hashes(external_raw_path_tumor)

# detect overlapping hashes
train_hash_set = set(train_hashes.keys())
external_hash_set_tumor = set(external_hashes_tumor.keys())
overlap_hashes_tumor = train_hash_set.intersection(external_hash_set_tumor)

# gather all duplicated external image paths
duplicate_external_tumor_path = []
for h in overlap_hashes_tumor:
    duplicate_external_tumor_path.extend(external_hashes_tumor[h])


print(f"Total duplicated image files in external tumor set: {len(duplicate_external_tumor_path)}")

# copy non-duplicated images to cleaned folder
all_external_paths = [p for paths in external_hashes_tumor.values() for p in paths]
non_duplicated_paths = set(all_external_paths) - set(duplicate_external_tumor_path)
print(f"Copying {len(non_duplicated_paths)} non-duplicated tumor images to cleaned folder")

# The cleaned folder is flat while the scan is recursive, so two source files
# in different sub-folders can share a basename. Track the names written in
# this run to report such collisions instead of overwriting silently.
copied_names = set()
# Sorted iteration makes the copy order (and the winner of any collision)
# reproducible; iterating the set directly is not ordered.
for path in tqdm(sorted(non_duplicated_paths), desc='Copying cleaned tumor images'):
    filename = os.path.basename(path)
    desc_path = os.path.join(external_path_tumor, filename)
    if filename in copied_names:
        print(f"Warning: {desc_path} was already written in this run; {path} overwrites it")
    copied_names.add(filename)
    try:
        shutil.copy2(path, desc_path)
    except Exception as e:
        # Interpolate the exception; the message used to end with a literal "e".
        print(f"Failed to copy {path} to {desc_path}: {e}")


100%|██████████| 2008/2008 [00:06<00:00, 308.34it/s]
100%|██████████| 3000/3000 [00:09<00:00, 307.96it/s]


Total duplicated image files in external tumor set: 877
Copying 2123 non-duplicated tumor images to cleaned folder


Copying cleaned tumor images: 100%|██████████| 2123/2123 [00:00<00:00, 14882.44it/s]


In [9]:
# check external image (no-tumor) against training images (tumor + no-tumor)
def perceptual_image_hash(image_path):
    """Compute the perceptual hash (pHash) of one image file.

    The image is converted to grayscale ('L' mode) and resized to 224x224
    before being passed to ``imagehash.phash``.

    Args:
        image_path: Path of the image file to hash.

    Returns:
        The string form of the perceptual hash, or ``None`` when the file
        could not be opened or processed (the error is printed, not raised).
    """
    try:
        # Put the image returned by Image.open under the context manager so the
        # underlying file is always closed; wrapping only the converted copy
        # left the opened file to be released by garbage collection.
        with Image.open(image_path) as source_img:
            img = source_img.convert('L').resize((224, 224))
            return str(imagehash.phash(img))
    except Exception as e:
        # Best effort: report the unreadable file and let the caller skip it.
        print(f"Error with image {image_path}: {e}")
        return None

def get_image_hashes(image_folder):
    """Group the files found under ``image_folder`` by perceptual hash.

    Every path matching ``*.*`` at any depth below the folder is hashed with
    ``perceptual_image_hash``; paths whose hash comes back falsy (``None`` on
    failure) are skipped.

    Args:
        image_folder: Root folder to scan recursively.

    Returns:
        A dict mapping each hash string to the list of paths that produced it.
    """
    search_pattern = os.path.join(image_folder, "**", '*.*')
    paths_by_hash = {}
    for candidate in tqdm(glob(search_pattern, recursive = True)):
        digest = perceptual_image_hash(candidate)
        if not digest:
            continue
        # First occurrence creates the bucket; later ones join it.
        paths_by_hash.setdefault(digest, []).append(candidate)
    return paths_by_hash



# Hash the training set and the raw external no-tumor images. Two files count
# as duplicates when their perceptual hashes are exactly equal.
train_hashes = get_image_hashes(train_path)
external_hashes_no_tumor = get_image_hashes(external_raw_path_no_tumor)

# detect overlapping hashes
train_hash_set = set(train_hashes.keys())
external_hash_set_no_tumor = set(external_hashes_no_tumor.keys())
overlap_hashes_no_tumor = train_hash_set.intersection(external_hash_set_no_tumor)

# gather all duplicated external image paths
duplicate_external_no_tumor_path = []
for h in overlap_hashes_no_tumor:
    duplicate_external_no_tumor_path.extend(external_hashes_no_tumor[h])


print(f"Total duplicated image files in external no tumor set: {len(duplicate_external_no_tumor_path)}")

# copy non-duplicated images to cleaned folder
all_external_paths = [p for paths in external_hashes_no_tumor.values() for p in paths]
non_duplicated_paths = set(all_external_paths) - set(duplicate_external_no_tumor_path)
print(f"Copying {len(non_duplicated_paths)} non-duplicated no tumor images to cleaned folder")

# The cleaned folder is flat while the scan is recursive, so two source files
# in different sub-folders can share a basename. Track the names written in
# this run to report such collisions instead of overwriting silently.
copied_names = set()
# Sorted iteration makes the copy order (and the winner of any collision)
# reproducible; iterating the set directly is not ordered.
for path in tqdm(sorted(non_duplicated_paths), desc='Copying cleaned no tumor images'):
    filename = os.path.basename(path)
    desc_path = os.path.join(external_path_no_tumor, filename)
    if filename in copied_names:
        print(f"Warning: {desc_path} was already written in this run; {path} overwrites it")
    copied_names.add(filename)
    try:
        shutil.copy2(path, desc_path)
    except Exception as e:
        # Interpolate the exception; the message used to end with a literal "e".
        print(f"Failed to copy {path} to {desc_path}: {e}")

100%|██████████| 2008/2008 [00:06<00:00, 308.92it/s]
100%|██████████| 2000/2000 [00:04<00:00, 422.31it/s]


Total duplicated image files in external no tumor set: 772
Copying 1228 non-duplicated no tumor images to cleaned folder


Copying cleaned no tumor images: 100%|██████████| 1228/1228 [00:00<00:00, 16422.97it/s]
