In [None]:
import logging
from django.db import models
from utils.merge_model_objects import merge_instances
from fuzzywuzzy import fuzz
from tqdm import tqdm
from collections import Counter

In [1]:
# Start from a clean slate: detach every handler currently attached to the
# root logger, so the configuration below applies even when this cell is re-run.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)

# Emit INFO and above, formatted as "<LEVEL>: <message>".
logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger('jupyter')
# Smoke test: this line should show up in the cell output.
logger.info('logging works')

INFO: logging works


In [2]:
# Print the docstring of the merge helper that the functions below rely on.
help(merge_instances)

Help on function merge_instances in module utils.merge_model_objects:

merge_instances(primary_object, *alias_objects, disable_signals=True)
    Merge several model instances into one, the `primary_object`.
    Use this function to merge model objects and migrate all of the related
    fields from the alias objects the primary object.
    Usage:
        from django.contrib.auth.models import User
        primary_user = User.objects.get(email='good@example.com')
        duplicate_user = User.objects.get(email='good+duplicate@example.com')
        merge(primary_user, duplicate_user)
    Based on: https://djangosnippets.org/snippets/382/
    Based on https://djangosnippets.org/snippets/2283/



# Run merge operations

In [3]:

def sort_images(images):
    """Return the images as a list, best merge target first.

    Each image is annotated with `usage`, the count of its `storyimage`
    relation. Ordering is: highest usage first, then the longest `exif_data`,
    then the earliest `created` value.

    Args:
        images: queryset of images; it must support `.annotate()`.

    Returns:
        list of annotated image instances in the order described above.
    """
    annotated = images.annotate(usage=models.Count('storyimage'))
    # Negated values make the ascending sort put the largest usage / exif first.
    return sorted(
        annotated,
        key=lambda im: (-im.usage, -len(im.exif_data), im.created),
    )

def merge_duplicates(qs, attrs=('id',), sort_func=None):
    """Merge rows of `qs` that share the same values for every name in `attrs`.

    Args:
        qs: queryset to scan; every row is visited once behind a progress bar.
        attrs: attribute names that together identify a duplicate group.
        sort_func: optional callable that receives the group and returns it in
            merge order. The first element is passed to `merge_instances` as
            the primary object, the rest as aliases.
    """
    progress = tqdm(qs)
    for record in progress:
        progress.set_description_str(f'{str(record)[:30]:<30} ')
        lookup = {name: getattr(record, name) for name in attrs}
        matches = qs.filter(**lookup)
        if len(matches) <= 1:
            # Only the record itself (or nothing) matched: nothing to merge.
            continue
        progress.set_postfix(item=str(record.pk), clones=len(matches)-1)
        ordered = sort_func(matches) if sort_func else matches
        merge_instances(*ordered)

    # NOTE(review): `qs` has already been evaluated by the loop above, so
    # `qs.count()` presumably reports the cached pre-merge size while
    # `qs.all()` re-queries the database -- confirm against Django's
    # QuerySet.count() behaviour before changing either expression.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')
    
def merge_images_by_field(field='imagehash', qs=None):
    """Merge images that `item.similar(field)` reports as similar to each other.

    For every image in `qs`, the image itself plus whatever `item.similar(field)`
    returns form a group. Groups with more than one member are ordered by
    `sort_images` and handed to `merge_instances`, whose first argument is the
    primary object that survives the merge.

    Args:
        field: name forwarded to each image's `similar()` method.
        qs: queryset of images to walk. When omitted, a fresh
            `ImageFile.objects.all()` is built for this call.
    """
    if qs is None:
        # Build the default per call. A queryset placed in the signature is
        # created once at definition time and shared by every later call; once
        # the first call has evaluated it, subsequent calls would walk the same
        # cached (stale) rows and log a stale "before" count.
        qs = ImageFile.objects.all()

    proc = tqdm(qs)
    for item in proc:
        proc.set_description_str(f'{str(item)[:30]:<30} ')
        # Union the similar images with the item itself so the group always
        # contains the current row (as long as it is still present in `qs`).
        clones = item.similar(field) | qs.filter(pk=item.pk)
        if len(clones) > 1:
            proc.set_postfix(item=str(item.pk), clones=len(clones)-1)
            merge_instances(*sort_images(clones))

    # NOTE(review): `qs` was evaluated by the loop, so `qs.count()` presumably
    # reports the cached pre-merge size while `qs.all()` re-queries -- confirm.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')
            
def merge_bylines():
    """Merge Byline rows that share the same story, contributor and credit."""
    merge_duplicates(
        Byline.objects.all(),
        ['story', 'contributor', 'credit'],
    )
    
def merge_images_by_md5():
    """Merge ImageFile rows with an identical `_md5`, ordered by `sort_images`."""
    merge_duplicates(
        ImageFile.objects.all(),
        ['_md5'],
        sort_func=sort_images,
    )
    
    
def _clone(*items):
    for item in items: 
        item.pk = None
        item.save()

def test_merge():
    """Exercise the merge helpers on freshly cloned rows.

    Three randomly ordered bylines are cloned and merged by `merge_bylines`.
    Then, for each of the 'md5' and 'imagehash' fields, three randomly ordered
    images are cloned and merged by `merge_images_by_field`.
    """
    sample_size = 3

    random_bylines = Byline.objects.order_by('?')[:sample_size]
    _clone(*random_bylines)
    merge_bylines()

    for field in ('md5', 'imagehash'):
        random_images = ImageFile.objects.order_by('?')[:sample_size]
        _clone(*random_images)
        merge_images_by_field(field)



def dupes(qs, attr):
    """Narrow `qs` to the rows whose `attr` value occurs more than once.

    Args:
        qs: queryset supporting `values_list()` and `filter()`.
        attr: field name to look for repeated values in.

    Returns:
        `qs` filtered with an `<attr>__in` lookup over the repeated values.
    """
    occurrences = Counter(qs.values_list(attr, flat=True))
    repeated = [value for value, seen in occurrences.most_common() if seen > 1]
    lookup = {f'{attr}__in': repeated}
    return qs.filter(**lookup)


In [7]:
# Restrict the pass to images whose `_imagehash` value occurs more than once,
# then merge each group of similar images.
# NOTE(review): `ImageFile` is not imported in the cells visible here --
# confirm it is in scope on a fresh kernel.
merge_images_by_field('imagehash', dupes(ImageFile.objects.all(), '_imagehash'))

27-karttyveri-01-TCST.jpg      : 100%|██████████| 2/2 [00:00<00:00, 11.54it/s]
INFO: ImageFile count: 2 -> 2


In [None]:
def fuzz_diff(a, attr):
    """Build a scorer that compares `attr` of `a` with `attr` of another object.

    Args:
        a: reference object.
        attr: name of the attribute to compare.

    Returns:
        A one-argument function `b -> fuzz.ratio(a.<attr>, b.<attr>)`.
    """
    def score(b):
        reference = getattr(a, attr)
        candidate = getattr(b, attr)
        return fuzz.ratio(reference, candidate)

    return score
        

def merge_contributors(cutoff=90):
    """Log groups of contributors whose display names look like duplicates.

    Contributors are visited in descending `pk` order. For each one, rows
    matching the `display_name__trigram_similar` lookup are fetched, reduced to
    those whose `fuzz.ratio` against the current name is above `cutoff`, and
    ordered by `keyfn` (descending). Groups with more than one member are
    logged.

    This is a dry run: the `merge_instances(*clones)` call is disabled, so
    nothing is modified.

    Args:
        cutoff: a candidate is kept only if its ratio is strictly greater
            than this value.
    """
    qs = Contributor.objects.order_by('-pk')
    for n, item in enumerate(qs):
        candidates = qs.filter(display_name__trigram_similar=item.display_name)
        if len(candidates) <= 1:
            continue

        ratio = fuzz_diff(item, 'display_name')
        clones = sorted(
            (c for c in candidates if ratio(c) > cutoff),
            key=keyfn,
            reverse=True,
        )
        if len(clones) <= 1:
            continue

        summary = ' + '.join(f'{c}({ratio(c)} {c.bylines_count()})' for c in clones)
        logger.info(f'{n}: merge ' + summary)
        # Disabled on purpose (dry run): merge_instances(*clones)

    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')

def keyfn(cn):
    """Sort key for a contributor: `(bylines_count(), has an email)`.

    Used with `reverse=True` in `merge_contributors`, so contributors with
    more bylines come first and, on a tie, those with an email.
    """
    byline_total = cn.bylines_count()
    has_email = bool(cn.email)
    return byline_total, has_email

# Run the contributor pass with the default cutoff of 90. The merge call
# inside merge_contributors is disabled, so this only logs candidate groups.
merge_contributors()

In [19]:
# Spot check: show the byline count of the contributor returned by `.first()`.
# NOTE(review): `.first()` presumably returns None on an empty table, which
# would make the next line fail -- confirm if this cell must survive a re-run.
cn = Contributor.objects.first()
cn.bylines_count()


108