In [1]:
import logging
from django.db import models
from utils.merge_model_objects import merge_instances
from rapidfuzz import fuzz
from tqdm import tqdm
from collections import Counter

In [2]:
# Start from a clean slate: drop whatever handlers are already attached to the
# root logger so the basicConfig() call below actually takes effect.
for stale_handler in list(logging.root.handlers):
    logging.root.removeHandler(stale_handler)

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger('jupyter')
logger.info('logging works')

INFO: logging works


In [3]:
# Print the signature and docstring of the merge helper for reference.
help(merge_instances)

Help on function merge_instances in module utils.merge_model_objects:

merge_instances(primary_object, *alias_objects, disable_signals=True)
    Merge several model instances into one, the `primary_object`.
    Use this function to merge model objects and migrate all of the related
    fields from the alias objects the primary object.
    Usage:
        from django.contrib.auth.models import User
        primary_user = User.objects.get(email='good@example.com')
        duplicate_user = User.objects.get(email='good+duplicate@example.com')
        merge(primary_user, duplicate_user)
    Based on: https://djangosnippets.org/snippets/382/
    Based on https://djangosnippets.org/snippets/2283/



# Run merge operations

In [4]:
def sort_images(images):
    """Return the images as a list, ordered best-first.

    Each image is annotated with ``usage`` (count of its ``storyimage``
    relation). Ordering is by descending usage, then descending length of
    ``exif_data``, then ascending ``created``.
    """
    counted = images.annotate(usage=models.Count('storyimage'))
    return sorted(
        counted,
        key=lambda im: (-im.usage, -len(im.exif_data), im.created),
    )

def merge_duplicates(qs, attrs=('id',), sort_func=None):
    """Merge objects in ``qs`` that share the same values for ``attrs``.

    For every object, the queryset is filtered on that object's values for
    each name in ``attrs``; when more than one match is found the matches are
    passed to ``merge_instances`` (first match acts as the primary).

    Args:
        qs: queryset to scan and to search for matches in.
        attrs: attribute names that must be equal for two objects to match.
        sort_func: optional callable applied to the matches before merging,
            used to decide which object comes first.
    """
    progress = tqdm(qs)
    for obj in progress:
        progress.set_description_str(f'{str(obj)[:30]:<30} ')
        lookup = {}
        for name in attrs:
            lookup[name] = getattr(obj, name)
        matches = qs.filter(**lookup)
        match_count = len(matches)
        if match_count <= 1:
            continue
        progress.set_postfix(item=str(obj.pk), clones=match_count - 1)
        group = sort_func(matches) if sort_func else matches
        merge_instances(*group)

    # NOTE(review): `qs` has been iterated above, so `qs.count()` presumably
    # reports the pre-merge size while `qs.all()` queries the database again.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')
    
def merge_images_by_field(field='imagehash', qs=None):
    """Merge image files that are similar to each other on ``field``.

    For every image in ``qs``, the result of ``item.similar(field)`` is
    combined with the image itself; when that yields more than one image the
    group is ordered with ``sort_images`` and handed to ``merge_instances``
    (first image acts as the primary).

    Args:
        field: name passed to ``item.similar``.
        qs: queryset of images to scan. Defaults to all ``ImageFile`` objects,
            resolved on every call.
    """
    if qs is None:
        # Resolve the default per call. A queryset in the signature would be
        # created once at definition time and shared by every call, carrying
        # its already-evaluated results from one run into the next.
        qs = ImageFile.objects.all()
    proc = tqdm(qs)
    for item in proc:
        proc.set_description_str(f'{str(item)[:30]:<30} ')
        # presumably `similar` returns a queryset of matching images; OR-ing
        # in the item's own row makes sure the item itself is in the group.
        clones = item.similar(field) | qs.filter(pk=item.pk)
        if len(clones) > 1:
            proc.set_postfix(item=str(item.pk), clones=len(clones)-1)
            merge_instances(*sort_images(clones))

    # NOTE(review): `qs` has been iterated above, so `qs.count()` presumably
    # reports the pre-merge size while `qs.all()` queries the database again.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')
            
def merge_bylines():
    """Merge bylines that share the same story, contributor and credit."""
    merge_duplicates(
        Byline.objects.all(),
        ['story', 'contributor', 'credit'],
    )
    
def merge_images_by_md5():
    """Merge image files whose ``stat['md5']`` value occurs more than once.

    Tallies the md5 of every ``ImageFile``, then for each md5 seen at least
    twice passes all images with that md5 to ``merge_instances``.
    """
    tally = Counter()
    for row in ImageFile.objects.all().values('stat'):
        tally[row['stat']['md5']] += 1
    repeated = [digest for digest, seen in tally.most_common() if seen > 1]

    progress = tqdm(repeated)
    for md5 in progress:
        progress.set_description_str(f'md5: {md5}')
        group = ImageFile.objects.filter(stat__md5=md5)
        merge_instances(*group)
    
def _clone(*items):
    """Clear the primary key of each item and save it.

    Every item is processed in turn: its ``pk`` is set to ``None`` and
    ``save()`` is called right after.
    """
    for original in items:
        setattr(original, 'pk', None)
        original.save()

def test_merge():
    """Create a few random duplicates, then run each merge routine in turn.

    Three randomly ordered objects are cloned before every merge step:
    bylines, then images merged on 'md5', then images merged on 'imagehash'.
    """
    def clone_three(manager):
        _clone(*manager.order_by('?')[:3])

    clone_three(Byline.objects)
    merge_bylines()

    clone_three(ImageFile.objects)
    merge_images_by_field('md5')

    clone_three(ImageFile.objects)
    merge_images_by_field('imagehash')

def dupes(qs, attr):
    """Restrict ``qs`` to rows whose ``attr`` value occurs more than once.

    Args:
        qs: queryset to inspect and filter.
        attr: field name whose repeated values are looked for.

    Returns:
        ``qs`` filtered with ``<attr>__in`` set to the repeated values.
    """
    tally = Counter(qs.values_list(attr, flat=True))
    repeated = [value for value, seen in tally.most_common() if seen > 1]
    lookup = {f'{attr}__in': repeated}
    return qs.filter(**lookup)

# Run the md5-based image merge; the progress bar below is its output.
merge_images_by_md5()                     

md5: f71b1912f137f830bc19be524fb8cdf6: 100%|██████████| 114/114 [00:07<00:00, 15.49it/s]


In [5]:
# Narrow the scan to images whose `_imagehash` value occurs more than once,
# then merge similar images within that subset.
duplicates = dupes(ImageFile.objects.all(), '_imagehash')
merge_images_by_field('imagehash', duplicates)
        
# Disabled alternative: the same flow keyed on `stat` / 'md5'.
#duplicates = dupes(ImageFile.objects.all(), 'stat')
#merge_images_by_field('md5', duplicates)

Selma-Joner.jpg                : 100%|██████████| 105/105 [00:32<00:00,  3.19it/s, clones=1, item=47393]
INFO: ImageFile count: 105 -> 60


In [18]:
# Inspect the md5 stored under `stat` for every image file.
[st['stat']['md5'] for st in ImageFile.objects.values('stat')]

['4187e2116097d4cd6217a2857d0639a0',
 '4d1d749e3de6f936b36507a906fc34c7',
 'c3e41dc70869c3fa5a574eb0d45a20d3',
 '778be7abb255698005defc64325d8f5d',
 'a433f22564e244cebec47646e0859f4c',
 '8031396aebcb513d3db42238e2c00ade',
 '5de0988eb9f3c6989df4526ad2df8583',
 '009e5f756fb5c8739b62d3b8bfafc8b3',
 'c1a1061cbd75e2afd493a6f4d16e1718',
 '65b0800c99c36f1a14cd4629a408fd27',
 'e741652282d54fcd03357a48585fd4fc',
 'd2b319b4ac053ce9ff3f7b42049a97dc',
 '5c053347d48184c1089a40556b3cc7b3',
 '09e9b868929ea4673caead38c5fcf51a',
 'f25dff0a425bdcd290e5a2e43df4556e',
 'ea23b909c6fec2e0ba596cd2ad003b59',
 '7b4de6e57b9679ce53eea1f5e807c745',
 '89f1779c0ee3a55a0c484ce13bbe4c0f',
 '27a428387ec43d5a841e008400358f02',
 '301e80fc7dd3109409fa814bb126b741',
 'f35fd039a67eee1370a7eca865658ff8',
 '6ba318c2ebc806810b1e21693c8caaed',
 '5ff98aac2e48be72144e7249d3126805',
 '1336b81f02c26a04e5a3da390c900e19',
 '13ac0149076c39970a1fc9d647f5268c',
 'd24739b0c04f4cbd868c38903effcf03',
 '8e441507f28cfe098532ed21ecc82adb',
 

In [41]:
def fuzz_diff(a, attr):
    """Build a scorer that compares other objects' ``attr`` with ``a``'s.

    Returns a one-argument function; called with ``b`` it returns
    ``fuzz.ratio`` of ``a``'s and ``b``'s values for ``attr``.
    """
    def score_against(b):
        reference, candidate = getattr(a, attr), getattr(b, attr)
        return fuzz.ratio(reference, candidate)

    return score_against
        

def merge_contributors(cutoff=90):
    """Find contributors with near-identical display names and merge them.

    Contributors are visited in descending pk order. For each one, candidates
    are fetched with a ``trigram_similar`` lookup on ``display_name`` and kept
    only if their ``fuzz.ratio`` against the current contributor's name is
    strictly greater than ``cutoff``. Remaining groups of two or more are
    sorted with ``keyfn`` (highest first) and passed to ``merge_instances``,
    so the first one acts as the primary.

    Contributors that were merged away earlier in the same run are skipped:
    they are still present in the already-fetched queryset, and using such a
    stale object as the reference could merge two surviving contributors that
    were never compared with each other.

    Args:
        cutoff: minimum ``fuzz.ratio`` score (exclusive) for a candidate to
            count as a duplicate of the current contributor.
    """
    qs = Contributor.objects.order_by('-pk')
    # pks of aliases already folded into a primary during this run.
    # Assumes merge_instances removes the alias rows -- TODO confirm.
    merged_away = set()
    for n, item in enumerate(qs):
        if item.pk in merged_away:
            continue
        candidates = qs.filter(display_name__trigram_similar=item.display_name)
        if len(candidates) < 2:
            continue
        ratio = fuzz_diff(item, 'display_name')
        clones = [c for c in candidates if ratio(c) > cutoff]
        if len(clones) < 2:
            continue
        clones.sort(key=keyfn, reverse=True)
        msg = f'{str(n):<5}: merge ' + ' + '.join(f'{c}({ratio(c)} {c.bylines_count()})' for c in clones)
        logger.info(msg)
        # Record the alias pks before merging, in case the merge clears them
        # on the instances.
        merged_away.update(c.pk for c in clones[1:])
        merge_instances(*clones)

    # NOTE(review): `qs` has been iterated above, so `qs.count()` presumably
    # reports the pre-merge size while `qs.all()` queries the database again.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')

def keyfn(cn):
    """Sort key for a contributor: (byline count, whether an email is set)."""
    byline_total = cn.bylines_count()
    has_email = True if cn.email else False
    return byline_total, has_email

# Run the contributor merge with the default cutoff.
merge_contributors()

INFO: Contributor count: 2552 -> 2552
