# Fuzzy-matching canonical volumes

Richard So provided a list of American novels canonized by Norton or Heath. This notebook does some fuzzy matching to identify corresponding Hathi volumes. We use deduplicated metadata in order to get the earliest available copy.

In [1]:
import pandas as pd
from difflib import SequenceMatcher
from collections import Counter

In [3]:
# Load the list of canonized texts; the preview below shows AUTHOR, TITLE,
# PERIOD and VOL columns (VOL is later translated into an anthology tag).
norton = pd.read_csv('canon/NORTON_HEATH_CANON_TEXTS.csv')
norton.head()

Unnamed: 0,AUTHOR,TITLE,PERIOD,VOL
0,Washington Irving,Rip Van Winkle,,1.0
1,Washington Irving,Legend of Sleepy Hollow,,1.0
2,James Fenimore Cooper,The Pioneers,,1.0
3,James Fenimore Cooper,The Last of the Mohicans,,1.0
4,Lydia Maria Child,Letters from New York,,1.0


In [27]:
# Load the tab-separated work-level metadata that canon entries are matched against.
# low_memory=False has pandas parse the file in one piece, avoiding the
# mixed-dtype inference that chunked parsing can produce.
work = pd.read_csv('../../noveltmmeta/workmeta.tsv', sep = '\t', low_memory = False)
work.shape

(138137, 28)

### Create "blocks" of records for speedier processing

Fuzzy matching can be pretty slow if you have to check each author/title combination against the whole list of 138,000 records. So it's conventional to group records into "blocks." Here we'll use the first two letters of the author's last name.

In reality, the number of books we're dealing with here isn't overwhelming, but I'm in the habit of doing this, and it sets a good example.

In [28]:
def blockcode(aname):
    """Return the two-character blocking key for an author string.

    Missing authors map to 'xx', names shorter than three characters map
    to 'nn', and every other name maps to its first two characters in
    lowercase.
    """
    if pd.isnull(aname):
        return 'xx'
    if len(aname) < 3:
        return 'nn'
    return aname.lower()[:2]

# Tag every record with its block code, then split the table so that each
# code maps to the DataFrame of records sharing it.
work = work.assign(block = work['author'].map(blockcode))

byblock = work.groupby('block')
block_dictionary = {code: members for code, members in byblock}

### Now the actual matching

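We check author first, then title. If the author similarity is at least 0.74 and the title similarity is above 0.7, we add the record to a list of candidates, scored by the product of the two similarities. We then take the highest-scoring candidate as the match.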

In [54]:
def interpret_tag(volnum):
    """Translate the VOL code of a canon entry into an anthology tag.

    Parameters
    ----------
    volnum : any value accepted by int()
        1 -> 'norton', 2 -> 'nortonshort', 3 -> 'heath'.

    Returns
    -------
    str or None
        The tag for codes 1-3; None (after printing a warning) otherwise.

    A value that cannot be converted to an integer (for example NaN or
    None) is reported with 'error in source' and treated as code 1.
    """
    try:
        volnum = int(volnum)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures count as bad source data; a bare except
        # would also swallow KeyboardInterrupt and SystemExit.
        volnum = 1
        print('error in source')

    if volnum == 1:
        return 'norton'
    elif volnum == 2:
        return 'nortonshort'
    elif volnum == 3:
        return 'heath'
    else:
        print('This should never happen.')
        return None

def fuzzymatch(str1, str2):
    """Return a similarity score for two strings.

    The cheap upper bound from SequenceMatcher.real_quick_ratio() is
    computed first; only when it exceeds 0.7 is the full ratio() worked
    out and returned. Otherwise the upper bound itself is returned.
    """
    matcher = SequenceMatcher(None, str1, str2)
    upper_bound = matcher.real_quick_ratio()
    if upper_bound > 0.7:
        return matcher.ratio()
    return upper_bound

# Cut-offs used when comparing a canon entry with a metadata record.
MIN_AUTHOR_SIMILARITY = 0.74   # records scoring below this on author are skipped
MIN_TITLE_SIMILARITY = 0.7     # title similarity must be strictly greater than this
TITLE_COMPARE_LENGTH = 28      # titles are compared on their first 28 characters only


def _normalize_author(rawname):
    """Turn 'First Middle Last' into lowercase 'last, first middle'."""
    names = rawname.split()
    return (names[-1] + ', ' + ' '.join(names[:-1])).lower()


def _score_candidates(normname, title, block):
    """Return (score, index label) pairs for rows of `block` passing both cut-offs.

    The score is the author similarity multiplied by the title similarity.
    """
    scored = []
    for idx2, row2 in block.iterrows():
        # Ignore any parenthetical after the name,
        # e.g. "Chesnutt, Charles W. (Charles Waddell)".
        auth2match = row2['author'].lower().split('(')[0]
        authsimilarity = fuzzymatch(normname, auth2match)
        if authsimilarity < MIN_AUTHOR_SIMILARITY:
            continue

        title2match = row2['shorttitle']
        if pd.isnull(title2match):
            # placeholder used when the record has no short title
            title2match = 'x x x'
        else:
            title2match = title2match.lower()[:TITLE_COMPARE_LENGTH]

        titlesimilarity = fuzzymatch(title, title2match)
        if titlesimilarity > MIN_TITLE_SIMILARITY:
            scored.append((authsimilarity * titlesimilarity, idx2))
    return scored


# (docid, tag) pairs, one per canon entry for which a volume was found
allmatches = []

for _, row in norton.iterrows():
    # we need to put authors in LastName, Firstname order
    normname = _normalize_author(row['AUTHOR'])
    title = row['TITLE'].lower()[:TITLE_COMPARE_LENGTH]
    tag = interpret_tag(row['VOL'])

    # Only records in the same block (first two letters of the name) are compared.
    block = block_dictionary.get(normname[0:2])
    possiblematches = [] if block is None else _score_candidates(normname, title, block)

    if possiblematches:
        # Highest score wins; equal scores resolve to the larger index label.
        # NOTE(review): if workmeta is ordered by date, that tie-break prefers
        # the later copy -- confirm this is intended.
        _, matchidx = max(possiblematches)

        print(normname, ' | ', title, block.loc[matchidx, 'author'], ' == ', block.loc[matchidx, 'shorttitle'])
        allmatches.append((block.loc[matchidx, 'docid'], tag))
    else:
        print(normname, ' || ', title, "NO MATCH", tag)
        
                
                
            

irving, washington  |  rip van winkle Irving, Washington  ==  Rip Van Winkle
irving, washington  |  legend of sleepy hollow Irving, Washington  ==  The legend of Sleepy Hollow, and The spectre bridegroom. From the "Sketch book"
cooper, james fenimore  |  the pioneers Cooper, James Fenimore  ==  The pioneers
cooper, james fenimore  |  the last of the mohicans Cooper, James Fenimore  ==  The last of the Mohicans : a narrative of 1757
child, lydia maria  |  letters from new york Child, Lydia Maria Francis  ==  Letters from New York : second series
emerson, ralph waldo  ||  nature NO MATCH norton
emerson, ralph waldo  ||  the american scholar NO MATCH norton
hawthorne, nathaniel  |  the scarlet letter Hawthorne, Nathaniel  ==  The scarlet letter
poe, edgar allan  ||  the purloined letter NO MATCH norton
poe, edgar allan  ||  the tell-tale heart NO MATCH norton
poe, edgar allan  |  the fall of the house of ush Poe, Edgar Allan  ==  The fall of the house of Usher : and other tales and prose 

kerouac, jack  ||  big sur NO MATCH norton
vonnegut, kurt  |  slaughterhouse-five Vonnegut, Kurt  ==  Slaughterhouse-five; or, The children's crusade : a duty-dance with death
paley, grace  ||  a conversation with my fathe NO MATCH norton
dickey, james  ||  drowning with others NO MATCH norton
dickey, james  ||  the heaven of animals NO MATCH norton
dickey, james  ||  falling NO MATCH norton
thompson, hunter s  ||  fear and loathing in las veg NO MATCH norton
baldwin, james  |  going to meet the man Baldwin, James  ==  Going to meet the man
o'connor, flannery  ||  the life you save may be you NO MATCH norton
o'connor, flannery  ||  good country people NO MATCH norton
leguin, ursula k  ||  schrodinger's cat NO MATCH norton
leguin, ursula k  ||  she unnames them NO MATCH norton
morrison, toni  ||  recitatif NO MATCH norton
updike, john  ||  separating NO MATCH norton
roth, philip  ||  defender of the faith NO MATCH norton
momaday, n scott  |  the way to rainy mountain Momaday, Natachee S

melville, herman  ||  benito cereno NO MATCH heath
melville, herman  ||  billy budd NO MATCH heath
cary, alice  ||  clovernook NO MATCH heath
stoddard, elizabeth  ||  lemorne versus huell NO MATCH heath
davis, rebecca harding  ||  life in the iron-mills NO MATCH heath
twain, mark  |  roughing it Twain, Mark  ==  Roughing it
twain, mark  ||  a true story NO MATCH heath
twain, mark  ||  the autobiography of mark tw NO MATCH heath
harris, joel chandler  |  uncle remus Harris, Joel Chandler  ==  Uncle Remus
harris, joel chandler  |  free joe Harris, Joel Chandler  ==  Free Joe
chesnutt, charles w  ||  the goophered grapevine NO MATCH heath
chesnutt, charles w  ||  the passing of grandison NO MATCH heath
chesnutt, charles w  |  the wife of his youth Chesnutt, Charles W. (Charles Waddell)  ==  The wife of his youth
cable, george washington  ||  tite poulette NO MATCH heath
dunbar-nelson, alice  ||  sister josepha NO MATCH heath
burton, maria amparo ruiz de  |  the squatter and the don Burton

mason, bobbie ann  ||  airwaves NO MATCH heath
ortiz, simon  ||  sand creek NO MATCH heath
wideman, john edgar  ||  valaida NO MATCH heath
anzaldua, gloria  ||  borderlands NO MATCH heath
smith, lee  ||  the bubba stories NO MATCH heath
rodriguez, richard  ||  the hunger of memory NO MATCH heath
walker, alice  ||  laurel NO MATCH heath
silko, leslie marmon  ||  lullaby NO MATCH heath
hagedorn, jessica  ||  the blossoming of bongbong NO MATCH heath
allison, dorothy  ||  don't tell me you don't know NO MATCH heath
yamashita, karen tei  |  tropic of orange Yamashita, Karen Tei  ==  Tropic of orange : a novel
erdrich, louise  |  love medicine Erdrich, Louise  ==  Love medicine : a novel
viramontes, helena maria  ||  the cariboo caf_ NO MATCH heath
cisneros, sandra  ||  eleven NO MATCH heath
jen, gish  ||  in the american society NO MATCH heath
wallace, david foster  ||  the devil is a busy man NO MATCH heath
lee, chang-rae  ||  coming home again NO MATCH heath
alexie, sherman  ||  because 

In [43]:
# Report the number of matched canon entries and the number of distinct
# volumes they resolve to (several entries can point at the same docid).
print("Number of matches: ", len(allmatches))
print("Distinct matches:", len({docid for docid, _ in allmatches}))

Number of matches:  96
Distinct matches: 73


In [59]:
# Extra (docid, tag) pairs to add to the automatic matches.
# NOTE(review): presumably located by hand -- confirm provenance.
additionalmatches = [
    ('wu.89098876212', 'norton'),
    ('wu.89098876212', 'heath'),
    ('miun.abr7310.0001.001', 'nortonshort'),
    ('mdp.39015010526716', 'nortonshort'),
    ('nyp.33433076060734', 'norton'),
    ('nc01.ark+=13960=t89g6f86r', 'norton'),
    ('pst.000028368182', 'heath'),
    ('nyp.33433076084221', 'heath'),
    ('nyp.33433076079254', 'nortonshort'),
]

In [60]:
# Fold the additional pairs into the fuzzy matches. Only pairs not already
# present are appended, so re-running this cell does not keep growing
# `allmatches`.
for pair in additionalmatches:
    if pair not in allmatches:
        allmatches.append(pair)

# Map each docid to the set of anthology tags it was matched under.
tagdictionary = dict()
for anid, tag in allmatches:
    tagdictionary.setdefault(anid, set()).add(tag)

# Every matched docid is a key of tagdictionary.
unique_ids = set(tagdictionary)

In [62]:
# Keep only the metadata rows whose docid appears among the matched ids.
is_matched = work['docid'].isin(unique_ids)
matched = work[is_matched]
matched.shape

(81, 29)

In [64]:
def map2parameter(anid):
    """Tell whether volume `anid` carries the tag named by `parameter2check`.

    Both `tagdictionary` (docid -> set of tags) and `parameter2check` are
    read from module scope at call time. Ids absent from `tagdictionary`
    yield False.
    """
    return parameter2check in tagdictionary.get(anid, ())

# Add one boolean column per anthology tag. map2parameter reads the
# module-level `parameter2check`, so the loop variable itself selects which
# tag is tested on each pass.
for parameter2check in ('norton', 'nortonshort', 'heath'):
    matched = matched.assign(**{parameter2check: matched.docid.map(map2parameter)})
matched.head()

Unnamed: 0,docid,oldauthor,author,authordate,inferreddate,latestcomp,datetype,startdate,enddate,imprint,...,enumcron,volnum,title,parttitle,earlyedition,shorttitle,block,norton,nortonshort,heath
6824,nyp.33433074792726,"Kirkland, Caroline M. (Caroline Matilda)","Kirkland, Caroline M. (Caroline Matilda)",1801-1864.,1839,1839,s,1839,,New York;C. S. Francis;Boston;J. H. Francis;1839.,...,,,"A new home--who'll follow? : | or, Glimpses of...",,True,"A new home--who'll follow? : or, Glimpses of w...",ki,False,False,True
7463,yale.39002014432901,"Dana, Richard Henry","Dana, Richard Henry",1815-1882.,1841,1841,s,1841,,London;E. Moxon;1841.,...,,,Two years before the mast : | a personal narra...,,True,Two years before the mast : a personal narrati...,da,True,False,False
7485,uc2.ark+=13960=t3nv9ss49,"Sedgwick, Catharine Maria","Sedgwick, Catharine Maria",1789-1867.,1842,1842,s,1842,,New York;Harper & brothers;1842.,...,,,Hope Leslie;,,True,Hope Leslie;,se,False,False,True
8501,wu.89008428005,"Child, Lydia Maria","Child, Lydia Maria Francis",1802-1880.,1845,1845,t,1847,1845.0,"New York;C.S. Francis & Co.;1847, c1845.",...,,,Letters from New York : | second series / | $c...,,True,Letters from New York : second series,ch,True,False,True
9643,uva.x000469901,"Cooper, James Fenimore","Cooper, James Fenimore",1789-1851.,1850,1850,m,1850,1851.0,New York;G. P. Putnam;1850-1851.,...,V.4,4.0,The leather-stocking tales / | $c: By J. Fenim...,The pioneers,True,The pioneers,co,True,False,True


In [69]:
# Save the matched canon volumes as tab-separated text, omitting the index column.
matched.to_csv('canon/fuzzy_matched_canon.tsv', sep = '\t', index = False)

In [66]:
# Load the table of bestseller volumes from bestsellers/found_bestsellers.csv.
best = pd.read_csv('bestsellers/found_bestsellers.csv')

In [67]:
# Size of the bestseller table as loaded.
best.shape


(883, 13)

In [68]:
# Keep a single row per docid (the first occurrence), then show the reduced size.
best = best.drop_duplicates(subset = ['docid'], keep = 'first')
best.shape

(849, 13)

In [70]:
# Preview the first rows of the bestseller table.
best.head()

Unnamed: 0,docid,inferreddate,firstpub,best,recordid,author,imprint,enumcron,title,authordate,gender,nationality,notes
0,wu.89104493614,1876,1876.0,True,7092539,"Southworth, Emma Dorothy Eliza Nevitte,",London|Milner|18--?,,"Self-raised, or, From the depths",-,,,
1,mdp.39015009209035,1838,1838.0,True,393135,"Dickens, Charles,",Philadelphia|T. B. Peterson|185-?,v.1,Nicholas Nickleby,1812-,m,uk,
2,nyp.33433075749584,1864,1864.0,True,8669865,"Payn, James,",Philadelphia|T.B. Peterson & Bros.|187-?,,Lost Sir Massingberd,1830-,m,uk,
3,uc2.ark:/13960/t15m63z1c,1894,1894.0,True,7665326,"Maclaren, Ian,",Chicago|E. A. Weeks & Company|189-?,,Beside the bonnie brier bush,-,,,
4,mdp.39015059406689,1896,1896.0,True,160491,"Parker, Gilbert,",New York|D. Appleton and company|189,,The seats of the mighty;,-,,,


In [71]:
# Write the bestseller table as tab-separated text, omitting the index column.
best.to_csv('bestsellers/deduplicated_bestsellers.tsv', sep = '\t', index = False)