## Second deduplication

This notebook begins with **manifestationmeta.tsv,** and moves toward a smaller dataset that aspires to contain only one copy of each "work," in [FRBR terminology.](https://en.wikipedia.org/wiki/Functional_Requirements_for_Bibliographic_Records) 

However, this reference to FRBR should not be taken very literally. In reality, we're just identifying (relatively) unique title-author pairs, which may or may not line up with "works" and "expressions." I say "relatively" because fuzzy matching is used to allow for minor variations in spelling and punctuation.


In [1]:
import pandas as pd
from difflib import SequenceMatcher


In [2]:
meta = pd.read_csv('manifestationmeta.tsv', sep = '\t', low_memory = False)

blocks = dict()
for idx in meta.index:
    name = meta.loc[idx, 'author']
    if pd.isnull(name) or len(name) < 3:
        name = 'nan'
        
    title = meta.loc[idx, 'shorttitle']
    
    # note that we use short titles, which means that we'll be using
    # the titles for individual volume parts when available
    
    if pd.isnull(title) or len(title) < 5:
        title = 'default'
    
    blockcode = name[0:3].lower() + title[0:5].lower()
    if blockcode not in blocks:
        blocks[blockcode] = set()
    
    blocks[blockcode].add(idx)

In [3]:
len(blocks)


89178

In [None]:
def probablymatch(str1, str2):
    if len(str1) > 25:
        str1 = str1[0: 25].lower()
    else:
        str1 = str1.lower()
        
    if len(str2) > 25:
        str2 = str2[0: 25].lower()
    else:
        str2 = str2.lower()
    
    m = SequenceMatcher(None, str1, str2)
    match = m.ratio()
    
    return match

groups = []
dubiouscalls = []

ctr = 0
for code, block in blocks.items():
    ctr += 1
    if ctr % 10000 == 1:
        print(ctr)
    
    already_checked = set()
    
    for b1 in block:
        matched = False
        for b2 in block:
            if b1 == b2:
                continue
            if (str(b1) + ' ' + str(b2)) in already_checked:
                continue
            
            auth1 = meta.loc[b1, 'author']
            auth2 = meta.loc[b2, 'author']
            title1 = meta.loc[b1, 'shorttitle']
            title2 = meta.loc[b2, 'shorttitle']
            
            if pd.isnull(auth1) or pd.isnull(auth2):
                continue
            if pd.isnull(title1) or pd.isnull(title2):
                continue
            
            if len(auth1) < 4 or len(auth2) < 4:
                continue
            if len(title1) < 5 or len(title2) < 5:
                continue
            
            if auth1 == auth2:
                authormatch = 1.0
            else:
                authormatch = probablymatch(auth1, auth2)
                if authormatch < 0.9:
                    continue
            
            if title1 == title2:
                titlematch = 1.0
            else:
                titlematch = probablymatch(title1, title2)
                if titlematch < 0.85:
                    continue
            
            if authormatch + titlematch < 1.87:
                continue
            elif authormatch + titlematch < 1.9:
                outline = auth1 + " | " + title1 + '\n' + auth2 + ' | ' + title2 + '\n' + str(authormatch + titlematch) + '\n'
                dubiouscalls.append(outline)           
            
            # we have a match!
            matched = True
            found = False
            for g in groups:
                if b1 in g or b2 in g:
                    g.add(b1)
                    g.add(b2)
                    found = True
                    break

            if not found:
                groups.append({b1, b2})
            
            already_checked.add(str(b2) + ' ' + str(b1))
            
        if not matched:
            groups.append({b1})
                
                

1
10001
20001

In [19]:
len(dubiouscalls)

8424

In [20]:
with open('dubiouscalls.txt', mode = 'w', encoding = 'utf-8') as f:
    for d in dubiouscalls:
        f.write(d)

In [21]:
len(groups)

146878

In [23]:
maxsize = 0
for g in groups:
    if len(g) > maxsize:
        maxsize = len(g)
print(maxsize)

317


In [33]:
newmeta = pd.read_csv('manifestationmeta.tsv', sep = '\t', low_memory = False, index_col = 'docid')

selected = []
ignored = []
errors = 0
instances = dict()

ctr = 0
for g in groups:
    ctr += 1
    if ctr % 10000 == 1:
        print(ctr)
        
    if len(g) == 1:
        for e in g:
            break
        selected.append(e)
        continue
    if len(g) < 1:
        errors += 1
        continue
    
    earliest = ''
    earliestdate = 2100
    for element in g:
        date = meta.loc[element, 'inferreddate']
        if pd.isnull(date):
            date = 2100
        else:
            date = int(date)
        
        if earliestdate == 2100 or date < earliestdate:
            earliestdate = date
            earliest = element
            if earliestdate < 1700:
                earliestdate = 2100
                # don't reward dubious dates
        
    record = meta.loc[earliest, 'recordid']
    title2match = str(meta.loc[earliest, 'shorttitle'])

    matching = []

    thisrec = meta.loc[meta.recordid == record, : ]
    for idx in thisrec.index:
        thistitle = str(thisrec.loc[idx, 'shorttitle'])
        match = probablymatch(title2match, thistitle)
        if match > 0.9:
            matching.append(idx)
    
    if len(matching) < 6:
        selected.extend(matching)
        docids = meta.loc[matching, 'docid']
        instanceseries = newmeta.loc[docids, 'instances']
        allinstances = sum(instanceseries)
        for m in matching:
            instances[m] = allinstances
    else:
        ignored.append((title2match, record))
        
            

1
10001
20001
30001
40001
50001
60001
70001
80001
90001
100001
110001
120001
130001
140001


  interactivity=interactivity, compiler=compiler, result=result)


In [36]:
print(len(groups))


146878


In [38]:
print(errors)

126156


In [30]:
ignored[0:20]

[('Scenes of Parisian life;', 1203519),
 ('Scenes of private life;', 1203519),
 ('Scenes of private life;', 1203519),
 ('Scenes of provincial life;', 1203519),
 ('Scenes of provincial life;', 1203519),
 ('Scenes of Parisian life', 7678129),
 ('The works of George Eliot', 8894081),
 ('The writings of George Eliot', 1419580),
 ("The world's one hundred best short stories", 6511333),
 ('Collected edition of the novels and tales', 8665191),
 ('Works', 7924576),
 ('"Captains courageous," a story of the Grand banks', 6156153),
 ('The works of Honoré de Balzac', 6142151),
 ('The works of Honoré de Balzac;', 1791774),
 ('The works of Honore   de Balzac', 8618533),
 ('The come  die humaine', 7126282),
 ('The comedy of human life;', 11261041),
 ('[Works]', 7924580),
 ('The novels and tales of Guy de Maupassant', 7569104),
 ('Romances and narratives', 432149)]

In [None]:
deduped