In [None]:
# Original sample code is provided by Shlomo Kashani
# https://forum.isic-archive.com/t/a-list-of-duplicate-images-in-the-training-set/1141

import pandas as pd 
import os
import numpy as np
import pandas as pd
import glob
import imagehash
from PIL import Image
from joblib import Parallel, delayed
from pathlib import Path
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Root folder of the local ISIC 2019 data.
# Raw string: '\I' is not a valid escape sequence, and a plain literal only
# works by accident (and emits a warning on recent Python versions).
data_folder = r'C:\ISIC_2019'
out_dist_image_folder = os.path.join(data_folder, 'ISIC_Archive_Out_Distribution')

# Scan the folder (recursively) once and reuse the result for both the count
# and the DataFrame.
out_dist_image_paths = sorted(Path(out_dist_image_folder).glob('**/*.jpg'))
print(len(out_dist_image_paths), 'out-of-distribution images')

# One row per image: 'image' is the file name without extension, 'path' is the
# location the file was actually found at. Using the globbed path (instead of
# re-joining folder + stem + '.jpg') keeps images inside sub-folders reachable.
df = pd.DataFrame({
    'image': [image_path.stem for image_path in out_dist_image_paths],
    'path': [str(image_path) for image_path in out_dist_image_paths],
})

def getImageMetaData(file_path):
    """Collect basic metadata and a perceptual hash for one image file.

    Parameters
    ----------
    file_path : path of the image to open with PIL.

    Returns
    -------
    tuple
        ``(size, mode, hash, file_path)`` where ``size`` and ``mode`` are the
        PIL image attributes of the same name, ``hash`` is the result of
        ``imagehash.phash`` and ``file_path`` is the argument, passed through
        so the result can be joined back to its source row.
    """
    # The context manager closes the underlying file as soon as the
    # attributes and the hash have been read.
    with Image.open(file_path) as image:
        size, mode = image.size, image.mode
        perceptual_hash = imagehash.phash(image)
    return size, mode, perceptual_hash, file_path

# Generate images metadata.
# Each image is opened and hashed in its own joblib task; the number of
# workers equals the number of CPUs reported by the OS.
img_meta_list = Parallel(n_jobs=os.cpu_count(), verbose=1)(
    delayed(getImageMetaData)(image_path) for image_path in df['path']
)

# Build the frame directly from the list of result tuples. Going through
# np.array() first fails on current NumPy: 'Size' is itself a 2-tuple, which
# makes the nested sequence ragged.
img_meta_df = pd.DataFrame(img_meta_list, columns=['Size', 'Mode', 'Hash', 'path'])

# Drop metadata columns left over from a previous run of this cell so that
# re-running it does not produce suffixed duplicates (Size_x, Size_y, ...).
df = (
    df.drop(columns=['Size', 'Mode', 'Hash'], errors='ignore')
      .merge(img_meta_df, on='path', how='left')
)
df.head()

## Find similar image pairs

In [None]:
# Pairwise hash difference.
# Subtracting two imagehash hashes yields the number of differing bits, so
# broadcasting the hash column against itself gives an N x N distance matrix.
# Two images are reported as similar when their distance is at most this value.
HASH_DIFF_THRESHOLD = 8

hashes = df['Hash'].to_numpy()
pair_diff = abs(hashes - hashes[:, None])
print('Unique difference values:', np.unique(pair_diff))
pair_diff_triu = np.triu(pair_diff)

# Collect the index pairs (row < column) whose distance is within the threshold.
# The strict upper triangle is selected by index rather than by testing
# "difference > 0": that test would also discard exact duplicates, whose
# distance is 0.
print('Similar image pairs:')
upper_rows, upper_cols = np.triu_indices(pair_diff.shape[0], k=1)
is_similar = (pair_diff[upper_rows, upper_cols] <= HASH_DIFF_THRESHOLD).astype(bool)
similar_pair = (upper_rows[is_similar], upper_cols[is_similar])

# Show every similar pair side by side, titled with the image names.
for idx0, idx1 in zip(*similar_pair):
    fig, (ax0, ax1) = plt.subplots(figsize=(14, 5), ncols=2)
    for ax, idx in ((ax0, idx0), (ax1, idx1)):
        record = df.iloc[idx]
        ax.set_title(record['image'])
        ax.imshow(plt.imread(record['path']))
    plt.show()
    # Release the figure so a long list of pairs does not accumulate in memory.
    plt.close(fig)