In [None]:
from apps.photo import file_operations 
from django.contrib.postgres.search import TrigramSimilarity
import PIL.Image
from apps.photo.file_operations import (
    pil_image, image_to_fingerprint, image_from_fingerprint
)
import imagehash

In [None]:
import IPython
import io
import base64

def as_img_tag(imagefile):
    """Render an image file's thumbnail as a self-contained HTML ``<img>`` tag.

    Args:
        imagefile: object whose ``small`` attribute is accepted by
            ``pil_image`` (the thumbnail source).

    Returns:
        str: an ``<img>`` tag whose ``src`` is a base64 PNG data URI, so the
        image can be displayed inline without being served from anywhere.
    """
    image = pil_image(imagefile.small)
    blob = io.BytesIO()
    image.save(blob, 'PNG')
    # b64encode emits a single line, so no newline stripping is required.
    data = base64.b64encode(blob.getvalue()).decode('ascii')
    # The media type must be exactly "image/png"; the previous
    # "image/png/" (trailing slash) produced a malformed data URI.
    return f'<img style="margin: 0" src="data:image/png;base64,{data}" >'

# Quick visual check: render the object returned by ImageFile.objects.last()
# inline as an <img> tag.
# NOTE(review): `ImageFile` is not imported in the visible cells — presumably
# the kernel environment preloads the models; confirm before a fresh run.
IPython.display.HTML(as_img_tag(ImageFile.objects.last()))

In [None]:
# Create a GIN trigram index on photo_imagefile._imagehash.
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE INDEX fails with
# "relation already exists" the second time the cell is executed.
# NOTE: the gin_trgm_ops operator class is provided by the pg_trgm extension,
# which must already be installed in the target database.
sql = "CREATE INDEX IF NOT EXISTS trgm_idx ON photo_imagefile USING GIN (_imagehash gin_trgm_ops);"
from django.db import connections
with connections['default'].cursor() as cursor:
    cursor.execute(sql)

In [None]:
# Print every ImageFile next to its `imagehashes` attribute for inspection.
# NOTE(review): this iterates the whole table and prints one line per row;
# consider slicing the queryset if the table is large.
for imf in ImageFile.objects.all():
    print(imf, imf.imagehashes)

In [None]:
from statistics import median

def get_dupes(qs, ahash, limit):
    """Narrow ``qs`` to rows whose ``_imagehash`` is trigram-similar to ``ahash``.

    Args:
        qs: queryset to search.
        ahash: reference hash; it is converted with ``str()`` before comparison.
        limit: exclusive lower bound on the similarity score.

    Returns:
        The queryset annotated with ``hash_similar``, filtered to scores
        greater than ``limit`` and ordered by descending ``hash_similar``.
    """
    similarity = TrigramSimilarity('_imagehash', str(ahash))
    scored = qs.annotate(hash_similar=similarity)
    above_limit = scored.filter(hash_similar__gt=limit)
    return above_limit.order_by('-hash_similar')

def sort_dupes(dupes, master_hashes, n=3, *, max_candidates=10, max_diff=10, max_ratio=1.5):
    """Keep only the candidates whose image hashes are closest to the master's.

    For each of the first ``max_candidates`` entries of ``dupes`` the
    per-hash differences to ``master_hashes`` are computed, and the median
    of the three smallest differences is used as that candidate's score.

    Args:
        dupes: queryset-like object supporting slicing, ``.none()`` and
            ``.filter(pk__in=...)``; its items expose ``pk`` and an
            ``imagehashes`` mapping of hash name -> hash value.
        master_hashes: mapping of hash name -> hash value for the reference
            image. Values must support ``candidate_value - master_value``.
        n: maximum number of candidates to keep.
        max_candidates: how many leading entries of ``dupes`` are examined.
        max_diff: exclusive upper bound on a candidate's score.
        max_ratio: a candidate is kept only if its score divided by the best
            score (plus 0.1) is below this ratio.

    Returns:
        ``dupes.none()`` when no candidate qualifies, otherwise
        ``dupes.filter(pk__in=...)`` restricted to at most ``n`` primary keys.
        The result is a filter of ``dupes``, so its ordering is whatever
        ``dupes`` itself defines, not the score order.
    """
    scored = []
    for dupe in dupes[:max_candidates]:
        # Only compare hash kinds present on both sides; a candidate with an
        # extra hash kind previously raised KeyError.
        diffs = sorted(
            value - master_hashes[name]
            for name, value in dupe.imagehashes.items()
            if name in master_hashes
        )
        if not diffs:
            # Nothing comparable: median([]) would raise StatisticsError.
            continue
        diff = median(diffs[:3])
        if diff < max_diff:
            scored.append((diff, dupe.pk))
    if not scored:
        return dupes.none()
    scored.sort()
    # The 0.1 offset keeps the ratio below defined when the best score is 0.
    best = scored[0][0] + 0.1
    keep = [pk for diff, pk in scored if diff / best < max_ratio][:n]
    return dupes.filter(pk__in=keep)

def imagehash_search(qs, master):
    """Search ``qs`` for likely duplicates of the ``master`` image.

    The master's hashes are computed with ``file_operations.get_imagehashes``;
    candidates are pre-selected by ``get_dupes`` using the ``'ahash'`` entry
    with a similarity limit of 0.1, then narrowed by ``sort_dupes``.

    Returns:
        Whatever ``sort_dupes`` returns for the pre-selected candidates.
    """
    hashes = file_operations.get_imagehashes(master)
    candidates = get_dupes(qs, hashes['ahash'], 0.1)
    return sort_dupes(candidates, hashes)
    

In [None]:
def show_dupes(n=5):
    """Display randomly chosen photos next to their detected duplicates.

    For up to ``n`` randomly ordered ``ImageFile`` rows, the thumbnail is
    round-tripped through ``image_to_fingerprint`` / ``image_from_fingerprint``
    and the result is used as the master image for ``imagehash_search`` over
    all ``ImageFile`` rows.

    Args:
        n: number of random photos to show.

    Returns:
        IPython.display.HTML: one heading per photo followed by a flex row
        holding the photo itself and every search result.
    """
    # Local import: `HTML` is not imported by name in the notebook's cells,
    # so the bare name raised NameError on a fresh kernel.
    from IPython.display import HTML

    parts = []
    for photo in ImageFile.objects.order_by('?')[:n]:
        img = image_from_fingerprint(image_to_fingerprint(pil_image(photo.small)))
        results = imagehash_search(ImageFile.objects.all(), img)
        parts.append(f'<h2>{photo}</h2>')
        parts.append('<div style="display:flex; align-items: flex-start">')
        parts.append(as_img_tag(photo))
        parts.extend(as_img_tag(im) for im in results)
        parts.append('</div>')

    return HTML(''.join(parts))
# Render 50 random photos with their search results as this cell's output.
show_dupes(50)

In [None]:
qs = ImageFile.objects.all()
photo = qs.order_by('?').first()
master = photo.small
master_hashes = file_operations.get_imagehashes(master)
%timeit dupes = get_dupes(qs, master_hashes['ahash'], 0.1)
sort_dupes(dupes, master_hashes)