In [15]:
import logging
from django.db import models
from utils.merge_model_objects import merge_instances
from fuzzywuzzy import fuzz
from tqdm import tqdm
from collections import Counter

In [16]:
# Remove every handler already attached to the root logger so that the
# basicConfig() call below installs the format used in this notebook.
for handler in list(logging.root.handlers):
    logging.root.removeHandler(handler)

logging.basicConfig(format='%(levelname)s: %(message)s', level=logging.INFO)
logger = logging.getLogger('jupyter')
# Emit one record right away to confirm the configuration is active.
logger.info('logging works')

INFO: logging works


In [17]:
# Print the signature and docstring of the merge helper used by the cells below.
help(merge_instances)

Help on function merge_instances in module utils.merge_model_objects:

merge_instances(primary_object, *alias_objects, disable_signals=True)
    Merge several model instances into one, the `primary_object`.
    Use this function to merge model objects and migrate all of the related
    fields from the alias objects the primary object.
    Usage:
        from django.contrib.auth.models import User
        primary_user = User.objects.get(email='good@example.com')
        duplicate_user = User.objects.get(email='good+duplicate@example.com')
        merge(primary_user, duplicate_user)
    Based on: https://djangosnippets.org/snippets/382/
    Based on https://djangosnippets.org/snippets/2283/



# Run merge operations

In [18]:

def sort_images(images):
    """Return the images as a list, ordered best-first.

    Each image is annotated with ``usage``, the count of its related
    ``storyimage`` rows. The ordering is: highest ``usage`` first, then the
    longest ``exif_data``, then the smallest ``created`` value.

    Args:
        images: queryset of images; must support ``annotate``.

    Returns:
        A plain list of the annotated image instances.
    """
    annotated = images.annotate(usage=models.Count('storyimage'))
    ranked = list(annotated)
    # Negated numbers give a descending order while `created` stays ascending.
    ranked.sort(key=lambda im: (-im.usage, -len(im.exif_data), im.created))
    return ranked

def merge_duplicates(qs, attrs=('id',), sort_func=None):
    """Merge rows of ``qs`` that have equal values for every name in ``attrs``.

    Args:
        qs: queryset to scan; the same queryset is filtered to find the
            clones of each item.
        attrs: attribute names that must all match for rows to be clones.
        sort_func: optional callable applied to the clones before merging.
            Its result is unpacked into ``merge_instances``, so its first
            element becomes the primary object.
    """
    progress = tqdm(qs)
    for item in progress:
        progress.set_description_str(f'{str(item)[:30]:<30} ')
        lookup = {}
        for attr in attrs:
            lookup[attr] = getattr(item, attr)
        clones = qs.filter(**lookup)
        n_found = len(clones)
        if n_found <= 1:
            # Only the item itself matched; nothing to merge.
            continue
        progress.set_postfix(item=str(item.pk), clones=n_found - 1)
        ordered = sort_func(clones) if sort_func else clones
        merge_instances(*ordered)

    # NOTE(review): the first number appears to be answered from the queryset
    # that the loop already evaluated (the "before" size), while `qs.all()`
    # is a fresh queryset counted in the database -- confirm before changing.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')
    
def merge_images_by_field(field='imagehash', qs=None):
    """Merge image files that ``item.similar(field)`` reports as alike.

    Args:
        field: name passed to each image's ``similar()`` method.
        qs: queryset of images to scan. When omitted, a fresh
            ``ImageFile.objects.all()`` is built for this call.
    """
    if qs is None:
        # Build the default per call. A queryset created in the signature is
        # one shared object: once a call has iterated it, later calls would
        # reuse its cached rows and miss inserts and deletions made since.
        qs = ImageFile.objects.all()
    proc = tqdm(qs)
    for item in proc:
        proc.set_description_str(f'{str(item)[:30]:<30} ')
        # Union with the item itself so it takes part in the merge.
        clones = item.similar(field) | qs.filter(pk=item.pk)
        if len(clones) > 1:
            proc.set_postfix(item=str(item.pk), clones=len(clones)-1)
            # The first image returned by sort_images becomes the primary object.
            merge_instances(*sort_images(clones))

    # NOTE(review): the first number appears to be answered from the queryset
    # that the loop already evaluated (the "before" size), while `qs.all()`
    # is a fresh queryset counted in the database -- confirm before changing.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')
            
def merge_bylines():
    """Merge bylines that agree on story, contributor and credit."""
    merge_duplicates(
        Byline.objects.all(),
        ['story', 'contributor', 'credit'],
    )
    
def merge_images_by_md5():
    """Merge image files with equal ``_md5`` values, ranked by ``sort_images``."""
    merge_duplicates(
        ImageFile.objects.all(),
        ['_md5'],
        sort_func=sort_images,
    )
    
    
def _clone(*items):
    for item in items: 
        item.pk = None
        item.save()

def test_merge():
    """Create duplicates of random rows and run the merge helpers on them.

    Three randomly ordered bylines are cloned and merged first. The same is
    then done for image files, once per similarity field ('md5', then
    'imagehash'). This writes to whatever database the ORM is connected to.
    """
    _clone(*Byline.objects.order_by('?')[:3])
    merge_bylines()

    for field in ('md5', 'imagehash'):
        _clone(*ImageFile.objects.order_by('?')[:3])
        merge_images_by_field(field)



def dupes(qs, attr):
    """Narrow ``qs`` to the rows whose ``attr`` value occurs more than once.

    Args:
        qs: queryset to inspect and filter.
        attr: field name whose values are tallied.

    Returns:
        ``qs`` filtered with an ``<attr>__in`` lookup on the repeated values.
    """
    tally = Counter(qs.values_list(attr, flat=True))
    repeated = []
    for value, occurrences in tally.most_common():
        if occurrences > 1:
            repeated.append(value)
    lookup = f'{attr}__in'
    return qs.filter(**{lookup: repeated})


In [19]:
# Scan only the images whose `_imagehash` value occurs more than once,
# rather than every image file.
merge_images_by_field('imagehash', dupes(ImageFile.objects.all(), '_imagehash'))

28-NYH-royking-OA-01.jpg       : 100%|██████████| 336/336 [01:50<00:00,  3.04it/s, clones=1, item=46192]
INFO: ImageFile count: 336 -> 156


In [20]:
def fuzz_diff(a, attr):
    """Build a scorer that compares other objects against ``a``.

    Args:
        a: reference object.
        attr: name of the attribute to compare on both objects.

    Returns:
        A one-argument callable returning ``fuzz.ratio`` of ``a``'s and the
        argument's ``attr`` values. Both attributes are read on each call.
    """
    return lambda b: fuzz.ratio(getattr(a, attr), getattr(b, attr))
        

def merge_contributors(cutoff=90):
    """Find contributors with near-identical display names and merge them.

    Contributors are visited in descending pk order. For each one, candidates
    are fetched with a ``display_name__trigram_similar`` lookup and kept only
    if their fuzzy ratio against the current item is above ``cutoff``. The
    survivors are ordered by ``keyfn`` (highest first) and merged, so the
    first of them becomes the primary object.

    Args:
        cutoff: fuzzy ratio a candidate must exceed to be merged.
    """
    qs = Contributor.objects.order_by('-pk')
    for n, item in enumerate(qs):
        candidates = qs.filter(display_name__trigram_similar=item.display_name)
        if len(candidates) <= 1:
            continue
        ratio = fuzz_diff(item, 'display_name')
        clones = sorted(
            [c for c in candidates if ratio(c) > cutoff],
            key=keyfn,
            reverse=True,
        )
        if len(clones) <= 1:
            continue
        # Each entry is logged as "name(ratio bylines_count)".
        described = ' + '.join(f'{c}({ratio(c)} {c.bylines_count()})' for c in clones)
        logger.info(f'{str(n):<5}: merge ' + described)
        merge_instances(*clones)

    # NOTE(review): the first number appears to be answered from the queryset
    # that the loop already evaluated (the "before" size), while `qs.all()`
    # is a fresh queryset counted in the database -- confirm before changing.
    logger.info(f'{qs.model.__qualname__} count: {qs.count()} -> {qs.all().count()}')

def keyfn(cn):
    """Sort key for a contributor: (number of bylines, has an email).

    Used with ``reverse=True``, so more bylines rank first and, on a tie,
    contributors with a non-empty email rank before those without.
    """
    n_bylines = cn.bylines_count()
    has_email = bool(cn.email)
    return n_bylines, has_email

# Run the fuzzy contributor merge with the default cutoff of 90.
merge_contributors()

INFO: 0    : merge Lena Nielsen(100 48) + Lena Nilsen(96 3)
INFO: 64   : merge Stig Siversten(93 2) + Stig Sivertsen(100 1)
INFO: 65   : merge Kevin Sivertsen(100 2) + Kevin Sivertsen(100 1)
INFO: 72   : merge Arnved Nedkvitne(100 8) + Arnved Nedkvitne(100 1)
INFO: 79   : merge Pia Ottilia Danielsen(100 3) + Pia Ottilia Danielsen(100 1)
INFO: 101  : merge Mari Mamre(100 8) + Mari Mamre(100 1)
INFO: 316  : merge Ingrid Os(100 1) + Ingrid Os(100 1)
INFO: 322  : merge Angélique Culvin(100 39) + Angélique Culvin(100 4)
INFO: 909  : merge Skjalg Bøhmer Vold(97 260) + Sjalg Bøhmer Vold(100 1)
INFO: 1676 : merge Ellen Munden Paalgard(98 26) + Ellen Munden Paalgaard(100 4)
INFO: 1764 : merge Jamal Mohsin(100 2) + Jamal Mohsin(100 1)
INFO: 1886 : merge Michael Brøndbo(100 209) + Michael Brøndbo(100 4)
INFO: Contributor count: 2598 -> 2586


In [21]:
# Spot check: show the byline count of the first contributor.
cn = Contributor.objects.first()
cn.bylines_count()


108

In [22]:
# List contributors whose name search returns a handful of other contributors,
# as candidates for a manual merge. Nothing is modified here.
for cn in Contributor.objects.all():
    # Only names longer than 10 and shorter than 20 characters are considered.
    if 10 < len(cn.display_name) < 20:
        matches = Contributor.objects.search(str(cn)).exclude(pk=cn.pk)
        # Show only contributors with one to four other matches.
        if 0 < len(matches) < 5:
            print(
                f'{str(cn.pk):<5} {str(cn):<40}  ->  ',
                ','.join(f'{str(m.pk):<5} {m}' for m in matches),
            )
    

14589 Raisa Porsanger                           ->   14590 Raisa Porsanger Og Privat
12309 Bjørn-Egil Mikalsen                       ->   12351 Bjørn–Egil Mikalsen Ill: Mads Sivertsen
14620 Malin Kvande                              ->   14594 Journalist Malin Kvande
12304 Mari Lund Haanshus                        ->   13465 Illustrasjon: Mari Lund Haanshus
14554 Regine Stokstad                           ->   14955 Regine Stokstad Studenter trenger
14610 Angélique Culvin                          ->   14697 Sylvia-Angélique Culvin
12389 Gunnar Thorenfeldt                        ->   12458 Gunnar Thorenfeldt Og Andreas
12393 Håkon Mosvold                             ->   12407 Håkon Mosvold Larsen
12592 Klaus Høiland                             ->   12497 Klaus Høiland Professor I Biologi
12597 Toril Roberg                              ->   12509 Toril Roberg. Politisk Viseformann
12581 Annette Birkeland                         ->   12547 Annette Birkeland. Leder Moderatgruppe
12574 Annet

In [24]:
# Display names chosen as the canonical record; every other contributor
# returned by a search for the name is merged into it.
names = [
    "Anette Remme",
    "Angélique Culvin",
    "Annette Birkeland",
    "Annette Orre",
    "Arnved Nedkvitne",
    "Bjarne Hodne",
    "Bjørn Erik Rasch",
    "Bjørn Haugstad",
    "Cecilia Johansen",
    "Cecilie Kappelslåen",
    "Egil Ellenes",
    "Ellinor Bent Dalbye",
    "Geir Wiknes",
    "Gunhild Oland",
    "Håkon Mosvold",
    "Inga Bostad",
    "Ingvild Skogvold",
    "Jamal Mohsin",
    "Jonas Holmqvist",
    "Julie Brundtland",
    "Kai A. Olsen",
    "Kevin Sivertsen",
    "Kjartan Almenning",
    "Klaus Høiland",
    "Lene C. Westgaard",
    "Malin Kvande",
    "Mari Lund Haanshus",
    "Michael Brøndbo",
    "Nordis Tennes",
    "Pia Sandved Berg",
    "Raisa Porsanger",
    "Regine Stokstad",
    "Tony Burner",
    "Toril Roberg",
    "Torstein Lindstad",
    "Trygve Wyller",
]
for name in names:
    # first() returns None when no contributor has exactly this display name.
    # Indexing with [0] would raise IndexError and abort the cell after some
    # merges had already been applied.
    master = Contributor.objects.filter(display_name=name).first()
    if master is None:
        logger.warning(f'no contributor with display name {name!r}; skipped')
        continue
    matches = Contributor.objects.search(name).exclude(pk=master.pk)
    # Truth-testing evaluates the queryset once; the unpacking below reuses
    # those rows instead of issuing a separate count query first.
    if matches:
        merge_instances(master, *matches)
        print(master)

Anette Remme
Angélique Culvin
Annette Birkeland
Annette Orre
Bjarne Hodne
Bjørn Erik Rasch
Bjørn Haugstad
Cecilia Johansen
Cecilie Kappelslåen
Egil Ellenes
Ellinor Bent Dalbye
Geir Wiknes
Gunhild Oland
Håkon Mosvold
Inga Bostad
Ingvild Skogvold
Jonas Holmqvist
Julie Brundtland
Kai A. Olsen
Kjartan Almenning
Klaus Høiland
Lene C. Westgaard
Malin Kvande
Mari Lund Haanshus
Nordis Tennes
Pia Sandved Berg
Raisa Porsanger
Regine Stokstad
Tony Burner
Toril Roberg
Torstein Lindstad
Trygve Wyller
