In [1]:
from glob import glob
import os

# Directory that holds the source photos to scan.
input_directory = './testinputdir'

# glob() yields entries in an arbitrary, filesystem-dependent order; sort them
# so the processing order (and the progress output) is the same on every run.
files = sorted(glob(os.path.join(input_directory, '*')))
print(files[:2])

['./testinputdir/IMG_20220511_145538.HEIC', './testinputdir/IMG_20220511_180512.HEIC']


In [2]:
import pyheif
import io
from PIL import Image

def dump_input_heic_as_jpg(input_path, output_path):
    """Decode the HEIF/HEIC file at ``input_path`` and write it out as a JPEG.

    Args:
        input_path: Path of the HEIF/HEIC file to read.
        output_path: Path of the JPEG file to write (overwritten if it exists).
    """
    with open(input_path, mode='rb') as f:
        heif_file = pyheif.read_heif(f)

    # Pass the raw-decoder arguments explicitly: rows in the decoded buffer may
    # be padded, so the stride must be given or the pixels are read misaligned.
    image = Image.frombytes(
        heif_file.mode,
        heif_file.size,
        heif_file.data,
        "raw",
        heif_file.mode,
        heif_file.stride,
    )

    # JPEG cannot store an alpha channel; drop it before encoding.
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Encode into memory first so a failed encode does not leave a truncated
    # file at output_path.
    buffer = io.BytesIO()
    image.save(buffer, format="jpeg")

    with open(output_path, mode='wb') as f:
        f.write(buffer.getvalue())

In [3]:
# from tempfile import TemporaryDirectory
import shutil
from pathlib import Path    

# Maps each converted/copied input path to the file name written into `p`.
mapping = {}

# A persistent working directory is used (instead of a TemporaryDirectory)
# because the converted files are read again by later cells.
p = './tempdir'
os.makedirs(p, exist_ok=True)  # the writes below fail if the directory is missing

for i, input_path in enumerate(files):
    source = Path(input_path)
    # suffix/stem handle names with no dot (e.g. sub-directories matched by the
    # glob) and names with several dots, where splitting on '.' breaks.
    filename_extension = source.suffix.lower()
    output_filename = source.stem + '.jpg'
    output_path = os.path.join(p, output_filename)
    print(f'[{i}/{len(files)}] {input_path} -> {output_path}')

    if filename_extension == '.heic':
        dump_input_heic_as_jpg(input_path, output_path)
    elif filename_extension in ('.jpg', '.jpeg'):
        # Already JPEG: copy as-is under the normalized '.jpg' name.
        shutil.copyfile(input_path, output_path)
    else:
        print(f'Ignoring {input_path}')
        continue
    mapping[input_path] = output_filename

[0/42] ./testinputdir/IMG_20220511_145538.HEIC -> ./tempdir/IMG_20220511_145538.jpg
[1/42] ./testinputdir/IMG_20220511_180512.HEIC -> ./tempdir/IMG_20220511_180512.jpg
[2/42] ./testinputdir/IMG_20220511_180611.HEIC -> ./tempdir/IMG_20220511_180611.jpg
[3/42] ./testinputdir/IMG_20220511_180640.HEIC -> ./tempdir/IMG_20220511_180640.jpg
[4/42] ./testinputdir/IMG_20220511_180642.HEIC -> ./tempdir/IMG_20220511_180642.jpg
[5/42] ./testinputdir/IMG_20220511_180657.HEIC -> ./tempdir/IMG_20220511_180657.jpg
[6/42] ./testinputdir/IMG_20220511_180659.heic -> ./tempdir/IMG_20220511_180659.jpg
[7/42] ./testinputdir/IMG_20220511_180704.heic -> ./tempdir/IMG_20220511_180704.jpg
[8/42] ./testinputdir/IMG_20220511_180706.HEIC -> ./tempdir/IMG_20220511_180706.jpg
[9/42] ./testinputdir/IMG_20220511_180706_1.heic -> ./tempdir/IMG_20220511_180706_1.jpg
[10/42] ./testinputdir/IMG_20220511_180740.HEIC -> ./tempdir/IMG_20220511_180740.jpg
[11/42] ./testinputdir/IMG_20220511_180758.HEIC -> ./tempdir/IMG_202205

In [4]:
from imagededup.methods import PHash

# Hash every image in the conversion output directory. Reuse `p` (set by the
# conversion cell) instead of repeating the directory name, so the two cells
# cannot drift apart.
phasher = PHash()
encodings = phasher.encode_images(image_dir=p)

2022-05-23 16:05:20,279: INFO Start: Calculating hashes...
100%|██████████| 40/40 [00:06<00:00,  6.35it/s]
2022-05-23 16:05:26,700: INFO End: Calculating hashes!


In [5]:
# Compare the hashes computed above; the result maps each image file name to
# the list of file names detected as its duplicates.
duplicates = phasher.find_duplicates(
    encoding_map=encodings,
)

2022-05-23 16:05:26,729: INFO Start: Evaluating hamming distances for getting duplicates
2022-05-23 16:05:26,730: INFO Start: Retrieving duplicates using Cython Brute force algorithm
100%|██████████| 40/40 [00:00<00:00, 207126.12it/s]
2022-05-23 16:05:26,917: INFO End: Retrieving duplicates using Cython Brute force algorithm
2022-05-23 16:05:26,918: INFO End: Evaluating hamming distances for getting duplicates


In [6]:
# Show the full result: image file name -> list of detected duplicate file
# names (an empty list means no duplicate was found for that image).
duplicates

{'IMG_20220511_145538.jpg': [],
 'IMG_20220511_180512.jpg': [],
 'IMG_20220511_180611.jpg': [],
 'IMG_20220511_180640.jpg': [],
 'IMG_20220511_180642.jpg': [],
 'IMG_20220511_180657.jpg': [],
 'IMG_20220511_180659.jpg': [],
 'IMG_20220511_180704.jpg': [],
 'IMG_20220511_180706.jpg': [],
 'IMG_20220511_180706_1.jpg': [],
 'IMG_20220511_180740.jpg': ['IMG_20220511_180758.jpg'],
 'IMG_20220511_180758.jpg': ['IMG_20220511_180740.jpg'],
 'IMG_20220511_180823.jpg': [],
 'IMG_20220511_180937.jpg': [],
 'IMG_20220511_180952.jpg': [],
 'IMG_20220511_181008.jpg': [],
 'IMG_20220511_181039.jpg': [],
 'IMG_20220511_181059.jpg': [],
 'IMG_20220511_181107.jpg': [],
 'IMG_20220511_181219.jpg': [],
 'IMG_20220511_181307.jpg': [],
 'IMG_20220511_181336.jpg': [],
 'IMG_20220511_181345.jpg': [],
 'IMG_20220511_181356.jpg': [],
 'IMG_20220511_181543.jpg': [],
 'IMG_20220511_181552.jpg': [],
 'IMG_20220511_181713.jpg': [],
 'IMG_20220511_181716.jpg': [],
 'IMG_20220511_181731.jpg': [],
 'IMG_20220511_18175