## Manually matching authors

This is not a well documented or illuminating notebook. It's a side thread that I pursued in order to standardize author names, using a combination of coding and elbow grease.

Basically the idea was to find pairs of names that *might* match, and then scan through the list manually confirming or rejecting the pairings.

Considering every possible pair would be exhausting and impossible. To make the task manageable, I sorted pairs by a score that combined the closeness of the names and the sheer *number* of volumes affected. The formula was

    num_of_vols_by_A * num_of_vols_by_B * AB_match^2

I only considered the top hundred pairs or so. The manually confirmed list of pairs is preserved in **manual_author_matches.tsv**, where the first column is an alias and the second column is the name to be preferred. Note that I did all of this before realizing that everything needed to pass through [Unicode normalization](https://stackoverflow.com/questions/16467479/normalizing-unicode). In practice, whenever I use **manual_author_matches.tsv** I will also apply that normalization.

In [1]:
import pandas as pd
from difflib import SequenceMatcher
from collections import Counter

In [3]:
# Load the tab-separated metadata table; the 'author' column is what the
# rest of this notebook works with.
meta = pd.read_csv('../manifestationmeta.tsv', sep = '\t', low_memory = False)

In [4]:
# Count how many rows of the metadata table carry each distinct author string.
# Handing the whole column to Counter replaces a label-based .loc lookup per
# row, which is slow and would yield a Series (not a scalar) if the index ever
# contained duplicate labels.
names = Counter(meta['author'])

In [14]:
# Bucket the author names by their first two characters. Names that are
# missing, or too short to supply a two-character key, are left out.
blocks = dict()
for n in names:
    if not pd.isnull(n) and len(n) >= 2:
        blocks.setdefault(n[0:2], []).append(n)

print(len(blocks))

720


In [20]:
def probablymatch(str1, str2):
    """Score how alike two strings are, skipping the costly step when possible.

    The cheap ``real_quick_ratio()`` estimate is computed first. If it is
    0.8 or lower it is returned as-is; only when it is above 0.8 is the full
    ``SequenceMatcher.ratio()`` computed and returned instead.
    """
    matcher = SequenceMatcher(None, str1, str2)
    estimate = matcher.real_quick_ratio()

    if estimate <= 0.8:
        return estimate

    return matcher.ratio()

def get_simtuples(blocks, names):
    """Score every ordered pair of distinct names that share a block.

    blocks -- dict whose values are lists of names
    names  -- mapping from name to its count, read as names[name]

    Returns a list of (similarity, match, ct1, ct2, n1, n2) tuples, one for
    each pair whose probablymatch() score is above 0.85, where similarity is
    match * match * ct1 * ct2. Each ordering of a pair is scored on its own,
    so (a, b) and (b, a) can both appear.
    """
    simtuples = []

    for ctr, block in enumerate(blocks.values(), start=1):
        # Progress indicator: print at the 1st, 11th, 21st, ... block.
        if ctr % 10 == 1:
            print(ctr)

        for n1 in block:
            ct1 = names[n1]

            for n2 in block:
                ct2 = names[n2]

                if n1 != n2:
                    match = probablymatch(n1, n2)
                    if match > 0.85:
                        simtuples.append(
                            (match * match * ct1 * ct2, match, ct1, ct2, n1, n2)
                        )

    return simtuples

# Score candidate pairs within each first-two-characters block.
simtuples = get_simtuples(blocks, names)



1
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191
201
211
221
231
241
251
261
271
281
291
301
311
321
331
341
351
361
371
381
391
401
411
421
431
441
451
461
471
481
491
501
511
521
531
541
551
561
571
581
591
601
611
621
631
641
651
661
671
681
691
701
711


In [23]:
# Tuples compare element by element, so this orders by similarity score
# (the first field), highest first.
simtuples = sorted(simtuples, reverse = True)

In [24]:
# Display the first twenty tuples.
simtuples[0: 20]

[(117571.91836734694,
  0.9142857142857143,
  485,
  290,
  'Balzac, Honoré de',
  'Balzac, Honoré de'),
 (117571.91836734692,
  0.9142857142857143,
  290,
  485,
  'Balzac, Honoré de',
  'Balzac, Honoré de'),
 (78174.81481481482,
  0.8888888888888888,
  485,
  204,
  'Balzac, Honoré de',
  'Balzac, Honor?? de'),
 (78174.8148148148,
  0.8888888888888888,
  204,
  485,
  'Balzac, Honor?? de',
  'Balzac, Honoré de'),
 (49452.93061224489,
  0.9142857142857143,
  290,
  204,
  'Balzac, Honoré de',
  'Balzac, Honor?? de'),
 (49452.93061224489,
  0.9142857142857143,
  204,
  290,
  'Balzac, Honor?? de',
  'Balzac, Honoré de'),
 (17675.89777777778,
  0.8666666666666667,
  101,
  233,
  'King, Charles',
  'Kingsley, Charles'),
 (17675.897777777776,
  0.8666666666666667,
  233,
  101,
  'Kingsley, Charles',
  'King, Charles'),
 (15152.972972972975,
  0.918918918918919,
  485,
  37,
  'Balzac, Honoré de',
  'Balzac, Honore   de'),
 (15152.972972972973,
  0.918918918918919,
  37,
  485,
  'Balzac

In [26]:
# Save the scored pairs as a tab-separated file: one tuple per line, fields
# in tuple order, each converted with str().
with open('simtuples.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.writelines('\t'.join(map(str, s)) + '\n' for s in simtuples)

In [32]:
# Interactive review of candidate pairs. For each pair, answer:
#   1       -> the first name is the preferred form (second becomes an alias)
#   2       -> the second name is the preferred form (first becomes an alias)
#   stop    -> finish reviewing now
#   <name>  -> any other reply longer than two characters is taken as the
#              preferred form for BOTH names
#   anything else (e.g. just Enter) rejects the pair.
matched_already = set()
equivalents = dict()
for s in simtuples:
    similarity, match, ct1, ct2, n1, n2 = s

    # The same pair shows up in both orders; skip the mirror image of one
    # that has already been reviewed.
    if (n2, n1) in matched_already:
        continue

    print(n1 + ' ' + str(ct1) + ' | ' + n2 + ' ' + str(ct2))
    response = input('?')
    if response == '1':
        equivalents[n2] = n1
    elif response == '2':
        equivalents[n1] = n2
    elif response == 'stop':
        break
    elif len(response) > 2:
        equivalents[n1] = response
        equivalents[n2] = response

    matched_already.add((n1, n2))

# Save the decisions whether the review ended with 'stop' or by running out
# of pairs; previously they were written only on 'stop', so reviewing every
# pair discarded all the work. (An interrupt still skips the save.)
with open('manual_author_matches.tsv', mode = 'a', encoding = 'utf-8') as f:
    # In append mode the stream starts at end-of-file, so position 0 means
    # the file is new or empty. Only then write the header, so re-running
    # this cell does not put a second header in the middle of the data.
    if f.tell() == 0:
        f.write('alias\trealname\n')
    for k, v in equivalents.items():
        f.write(k + '\t' + v + '\n')

Balzac, Honoré de 485 | Balzac, Honoré de 290


KeyboardInterrupt: 

In [33]:
# What about names whose first word differs, so that they land in different
# blocks even though the rest of the name agrees? To catch those, build a
# second set of blocks keyed on the first two characters of the second word.

secondblocks = dict()

for n in names:
    if pd.isnull(n):
        continue

    # Turn commas into spaces first, in case words are glued by a comma.
    words = n.replace(',', ' ').split()

    # A key needs a second word that is at least two characters long.
    if len(words) >= 2 and len(words[1]) >= 2:
        secondblocks.setdefault(words[1][0:2], []).append(n)

print(len(secondblocks))

def get_misaligned_simtuples(blocks, secondblocks, names):
    """Score names from a block against names from the same-coded second block.

    blocks       -- dict mapping a code to a list of names
    secondblocks -- dict mapping a code to a list of names; codes present in
                    blocks but absent here are skipped
    names        -- mapping from name to its count, read as names[name]

    Returns a list of (similarity, match, ct1, ct2, n1, n2) tuples for each
    pair of distinct names whose probablymatch() score is above 0.9, where
    similarity is match * match * ct1 * ct2.
    """
    simtuples = []

    for ctr, (code, block) in enumerate(blocks.items(), start=1):
        # Progress indicator: print at the 1st, 11th, 21st, ... block,
        # including blocks that end up being skipped below.
        if ctr % 10 == 1:
            print(ctr)

        if code not in secondblocks:
            continue
        secondblock = secondblocks[code]

        for n1 in block:
            ct1 = names[n1]

            for n2 in secondblock:
                ct2 = names[n2]

                if n1 != n2:
                    match = probablymatch(n1, n2)
                    # Higher threshold than 0.85: matches across blocks are
                    # going to be rare.
                    if match > 0.9:
                        simtuples.append(
                            (match * match * ct1 * ct2, match, ct1, ct2, n1, n2)
                        )

    return simtuples

# Compare each first-word block against the second-word block with the same
# two-character code.
misaligned_simtuples = get_misaligned_simtuples(blocks, secondblocks, names)

661
1
11
21
31
41
51
61
71
81
91
101
111
121
131
141
151
161
171
181
191
201
211
221
231
241
251
261
271
281
291
301
311
321
331
341
351
361
371
381
391
401
411
421
431
441
451
461
471
481
491
501
511
521
531
541
551
561
571
581
591
601
611
621
631
641
651
661
671
681
691
701
711


In [41]:
# Interactive review of the cross-block candidate pairs. For each pair:
#   1       -> the first name is the preferred form (second becomes an alias)
#   2       -> the second name is the preferred form (first becomes an alias)
#   stop    -> finish reviewing now
#   <name>  -> any other reply longer than two characters is taken as the
#              preferred form for BOTH names
#   anything else (e.g. just Enter) rejects the pair.
matched_already = set()
equivalents = dict()
for s in misaligned_simtuples:
    similarity, match, ct1, ct2, n1, n2 = s

    # Skip the mirror image of a pair that has already been reviewed.
    if (n2, n1) in matched_already:
        continue

    print(n1 + ' ' + str(ct1) + ' | ' + n2 + ' ' + str(ct2))
    response = input('?')
    if response == '1':
        equivalents[n2] = n1
    elif response == '2':
        equivalents[n1] = n2
    elif response == 'stop':
        break
    elif len(response) > 2:
        equivalents[n1] = response
        equivalents[n2] = response

    matched_already.add((n1, n2))

# Save the decisions whether the review ended with 'stop' or by running out
# of pairs; previously they were written only on 'stop', so working through
# the whole (short) list discarded every answer. (An interrupt still skips
# the save.)
with open('manual_author_matches.tsv', mode = 'a', encoding = 'utf-8') as f:
    # In append mode the stream starts at end-of-file, so position 0 means
    # the file is new or empty; give it the column header in that case so
    # the file always has the same alias/realname layout.
    if f.tell() == 0:
        f.write('alias\trealname\n')
    for k, v in equivalents.items():
        f.write(k + '\t' + v + '\n')

Dafoe, Daniel 1 | Defoe, Daniel 765
?2
Greene, Graham 148 | Green, Graham 1
?
Bjørnson, Bjørnstjerne 76 | Bjr̜nson, Bjr̜nstjerne 1
?1
Herbert, Henry William 17 | Weber, Henry William 3
?
Bj??rnson, Bj??rnstjerne 22 | Bjr??nson, Bjr??nstjerne 1
?Bjørnson, Bjørnstjerne
Bj??rnson, Bj??rnstjerne 22 | Bjr??nson, Bj?_rnstjerne 1
?Bjørnson, Bjørnstjerne
MacCarthy, Mary 1 | McCarthy, Mary 19
?
Brougham and Vaux, Henry Brougham 2 | Baron, Brougham and Vaux, Henry Brougham 6
?1
Williams, William 5 | Williams, William G 2
?
Williams, William 5 | Willis, William 2
?
[Thomas, Lida Larrimore (Turner) Mrs.] 1 | ] [Thomas, Lida Larrimore (Turner), Mrs 7
?Thomas, Lida Larrimore (Turner), Mrs
Lamington, Alexander Dundas Ross Wishart Cochrane-Baillie 6 | baron, Lamington, Alexander Dundas Ross Wishart Cochrane-Baillie 1
?1
Lamington, Alexander Dundas Ross Wishart Cochrane-Baillie 6 | Baron, Lamington, Alexander Dundas Ross Wishart Cochrane-Baillie 1
?1
Mahy, Margaret 3 | Mayo, Margaret 2
?
Villaseñor, Vi

In [38]:
# How many cross-block candidate tuples were found?
len(misaligned_simtuples)

68

In [39]:
# Sort in place, highest similarity score (the first tuple field) first.
misaligned_simtuples.sort(reverse = True)

In [40]:
# Display the full list of cross-block candidates.
misaligned_simtuples


[(651.8343195266273,
  0.9230769230769231,
  1,
  765,
  'Dafoe, Daniel',
  'Defoe, Daniel'),
 (137.24005486968449,
  0.9629629629629629,
  148,
  1,
  'Greene, Graham',
  'Green, Graham'),
 (137.24005486968449,
  0.9629629629629629,
  1,
  148,
  'Green, Graham',
  'Greene, Graham'),
 (62.809917355371894,
  0.9090909090909091,
  76,
  1,
  'Bjørnson, Bjørnstjerne',
  'Bjr̜nson, Bjr̜nstjerne'),
 (62.809917355371894,
  0.9090909090909091,
  1,
  76,
  'Bjr̜nson, Bjr̜nstjerne',
  'Bjørnson, Bjørnstjerne'),
 (41.74829931972789,
  0.9047619047619048,
  17,
  3,
  'Herbert, Henry William',
  'Weber, Henry William'),
 (18.486111111111107,
  0.9166666666666666,
  22,
  1,
  'Bj??rnson, Bj??rnstjerne',
  'Bjr??nson, Bjr??nstjerne'),
 (18.486111111111107,
  0.9166666666666666,
  22,
  1,
  'Bj??rnson, Bj??rnstjerne',
  'Bjr??nson, Bj?_rnstjerne'),
 (18.486111111111107,
  0.9166666666666666,
  1,
  22,
  'Bjr??nson, Bjr??nstjerne',
  'Bj??rnson, Bj??rnstjerne'),
 (18.486111111111107,
  0.9166666