In [None]:
# Original sample code is provided by Shlomo Kashani
# https://forum.isic-archive.com/t/a-list-of-duplicate-images-in-the-training-set/1141

import pandas as pd 
import os
import numpy as np
import pandas as pd
import glob
import imagehash
from PIL import Image
from joblib import Parallel, delayed
from pathlib import Path
import matplotlib.pyplot as plt

%matplotlib inline


def create_dataframe_from_folders(image_folders):
    """Index every ``.jpg`` file found under the given folders.

    Each folder is searched recursively. The matches of one folder are sorted
    before being appended, so the overall order follows ``image_folders``.
    The number of files found is printed.

    Args:
        image_folders: Iterable of folder paths to scan.

    Returns:
        A DataFrame with a ``path`` column (the ``Path`` of each file) and an
        ``image`` column (the file name without its extension).
    """
    jpg_files = [
        jpg_file
        for folder in image_folders
        for jpg_file in sorted(Path(folder).glob('**/*.jpg'))
    ]
    print(len(jpg_files), 'images')

    frame = pd.DataFrame(jpg_files, columns=['path'])
    # The stem (name without extension) serves as the image identifier.
    frame['image'] = frame['path'].map(lambda file_path: Path(file_path).stem)
    return frame

def get_image_metadata(file_path):
    """Read one image and return its basic properties plus a perceptual hash.

    Args:
        file_path: Location of the image file, passed to ``Image.open``.

    Returns:
        A 4-tuple ``(size, mode, hash, file_path)`` where ``size`` and
        ``mode`` are read from the opened PIL image and ``hash`` is the
        result of ``imagehash.phash`` on it. ``file_path`` is echoed back
        so the caller can match the result to its input.
    """
    with Image.open(file_path) as picture:
        perceptual_hash = imagehash.phash(picture)
        dimensions, colour_mode = picture.size, picture.mode
    return dimensions, colour_mode, perceptual_hash, file_path
    
def get_image_metadata_parallel(df):
    """Compute image metadata for every row of ``df`` using parallel workers.

    ``get_image_metadata`` is run once per value of the ``path`` column, with
    one joblib worker per CPU reported by ``os.cpu_count()``.

    Args:
        df: DataFrame with a ``path`` column holding image file locations.

    Returns:
        ``df`` left-merged on ``path`` with the new columns ``Size``,
        ``Mode`` and ``Hash``.
    """
    metadata_columns = ['Size', 'Mode', 'Hash', 'path']
    img_meta_list = Parallel(n_jobs=os.cpu_count(), verbose=1)(
        delayed(get_image_metadata)(file_path) for file_path in df['path'])
    # Build the frame straight from the list of result tuples. Routing it
    # through np.array() makes NumPy try to unpack the nested (width, height)
    # size tuple, which yields a ragged array (an error on NumPy >= 1.24), and
    # it also breaks when the list is empty.
    img_meta_df = pd.DataFrame(img_meta_list, columns=metadata_columns)
    return df.merge(img_meta_df, on='path', how='left')

def plot_similar_pairs(df0, df1, similar_pair_indices):
    """Print and display, side by side, each pair of images flagged as similar.

    Args:
        df0: DataFrame with ``image`` and ``path`` columns; supplies the
            left-hand image of each pair.
        df1: DataFrame with the same columns; supplies the right-hand image.
        similar_pair_indices: Two index sequences. Entry ``i`` of the first
            is a row position in ``df0`` and entry ``i`` of the second is a
            row position in ``df1``. The first sequence determines how many
            pairs are drawn.
    """
    for pair_no, left_pos in enumerate(similar_pair_indices[0]):
        right_pos = similar_pair_indices[1][pair_no]
        left_row = df0.iloc[left_pos]
        right_row = df1.iloc[right_pos]
        print("{} v.s. {}".format(left_row['image'], right_row['image']))

        # One figure per pair, left image from df0 and right image from df1.
        fig, axes = plt.subplots(figsize=(14, 5), ncols=2)
        for axis, row in zip(axes, (left_row, right_row)):
            axis.set_title(row['image'])
            axis.imshow(plt.imread(row['path']))
        plt.show()

## Hash Difference Matrix

In [None]:
# Folder(s) whose images are checked against each other for near-duplicates.
image_folders = ['C:/ISIC_2019/Out_Distribution']

# Index the image files, then attach size, mode and perceptual hash per image.
df = get_image_metadata_parallel(create_dataframe_from_folders(image_folders))
df.head()

In [None]:
# Pairwise hash difference matrix: entry [i, j] is the absolute difference
# between the hashes of rows j and i, so the matrix is symmetric.
hashes = df['Hash'].to_numpy()
pair_diff = abs(hashes - hashes[:, None])
print('Unique difference values:', np.unique(pair_diff))
pair_diff_triu = np.triu(pair_diff)

# Largest hash difference still reported as a near-duplicate.
SELF_MAX_HASH_DIFF = 8

# Take each unordered pair once by masking the strict upper triangle (k=1
# drops the diagonal, i.e. an image compared with itself). Filtering on
# "difference > 0" instead would also discard *distinct* images whose hashes
# are identical, which are the strongest duplicate candidates.
is_distinct_pair = np.triu(np.ones(pair_diff.shape, dtype=bool), k=1)
is_close = (pair_diff <= SELF_MAX_HASH_DIFF).astype(bool)
similar_pair_indices = np.where(is_distinct_pair & is_close)
print(len(similar_pair_indices[0]), 'similar image pairs:')

In [None]:
# Both sides of every pair come from the same frame (N x N comparison).
plot_similar_pairs(df0=df, df1=df, similar_pair_indices=similar_pair_indices)

## N x M Comparison

In [None]:
# Compare every image under Out_Distribution (df0) with every image under
# ISIC_2019_Training_Input (df1).
df0 = create_dataframe_from_folders(['C:/ISIC_2019/Out_Distribution'])
df0 = get_image_metadata_parallel(df0)

df1 = create_dataframe_from_folders(['C:/ISIC_2019/ISIC_2019_Training_Input'])
df1 = get_image_metadata_parallel(df1)

# Largest hash difference still reported as a match between the two sets.
CROSS_MAX_HASH_DIFF = 6

df0_indices = []
df1_indices = []

# Record row *positions* (enumerate), not index labels: plot_similar_pairs
# looks the rows up with .iloc, so labels would only be correct while both
# frames happen to carry a default 0..n-1 index.
hashes0 = df0['Hash'].to_numpy()
hashes1 = df1['Hash'].to_numpy()
for df0_idx, value0 in enumerate(hashes0):
    for df1_idx, value1 in enumerate(hashes1):
        # abs() is never negative, so only the upper bound needs checking.
        diff = abs(value0 - value1)
        if diff <= CROSS_MAX_HASH_DIFF:
            df0_indices.append(df0_idx)
            df1_indices.append(df1_idx)

In [None]:
# Pack the matches as a pair of index arrays (df0 side, df1 side) — the
# layout plot_similar_pairs expects.
similar_pair_indices = tuple(
    np.asarray(indices) for indices in (df0_indices, df1_indices)
)
plot_similar_pairs(df0, df1, similar_pair_indices)