# Parse Copyright 2

This notebook is a clone of parse_copyright, made in order to parallelize it.

In [1]:
import pandas as pd
from difflib import SequenceMatcher
import re

In [2]:
# Load the tab-separated copyright table from an absolute local path.
# low_memory = False asks pandas to parse the file in one pass rather than in
# chunks, which avoids mixed-type column inference (per the pandas read_csv docs).
df = pd.read_csv('/Users/tunder/work/copyright/tabularcopyright.tsv', sep = '\t', low_memory = False)

In [3]:
# Load the tab-separated title metadata from a path relative to the working directory.
titles = pd.read_csv('../noveltmmeta/metadata/titlemeta.tsv', sep = '\t', low_memory = False)
# Keep only the rows whose `latestcomp` value is greater than 1920.
titles = titles.loc[titles.latestcomp > 1920, : ]
# Bare expression: the notebook displays the (rows, columns) shape of the filtered table.
titles.shape

(75929, 30)

In [4]:
def getinitial(astring):
    """Return a short blocking key built from the start of a name.

    The name is lowercased and every character that is neither a word
    character nor whitespace is removed. The key is the first two characters
    of what remains, or the single remaining character if there is only one.

    Missing values, and names with nothing left after cleaning, yield the
    placeholder 'xx'.
    """
    placeholder = 'xx'
    if pd.isnull(astring):
        return placeholder

    cleaned = re.sub(r'[^\w\s]', '', astring.lower())

    # Slicing a one-character string returns that character, so a single
    # slice covers both the one-character and the longer case.
    return cleaned[:2] if cleaned else placeholder

# Add a blocking key per row: getinitial() applied to the `author` column.
# The matching loop further down only compares records that share this key.
titles['initial'] = titles['author'].map(getinitial)
# Bare expression: the notebook displays the first rows, including the new column.
titles.head()

Unnamed: 0,docid,oldauthor,author,authordate,inferreddate,latestcomp,datetype,startdate,enddate,imprint,...,copiesin25yrs,enumcron,volnum,title,parttitle,earlyedition,shorttitle,nonficprob,juvenileprob,initial
6,mdp.39015059414725,"McKenna, Stephen","McKenna, Stephen",1888-1967.,,1967.0,n,,,London|Hutchinson & co.|n.d.,...,1.0,,,"Lady Lilith, | a novel: Beong the first part o...",,True,"Lady Lilith, a novel: Beong the first part of ...",0.165138,0.001126,mc
7,mdp.39015063920006,"Haggard, H. Rider (Henry Rider)","Haggard, H. Rider (Henry Rider)",1856-1925.,,1925.0,n,,,New York|J. S. Ogilvie|n.d.,...,1.0,,,Beatrice | [a novel] | $c: by H. Rider Haggard.,,True,Beatrice [a novel],0.070872,0.001023,ha
8,mdp.39015035876971,"Malet, Lucas","Malet, Lucas",1852-1931.,,1931.0,n,,,New York|T. Y. Crowell & co.|n.d.,...,1.0,,,Little Peter: | a Christmas morality for child...,,True,Little Peter: a Christmas morality for childre...,0.094639,0.512603,ma
13,mdp.39015063933132,"Lawrence, Charles Edward","Lawrence, Charles Edward",1870-1940.,,1940.0,n,,,New York|E. P. Dutton & con.|n.d.,...,1.0,,,"The god in the thicket, | $c: by C. E. Lawrence.",,True,The god in the thicket,0.066802,0.336583,la
21,mdp.39015063548997,"Glyn, Elinor","Glyn, Elinor",1864-1943.,,1943.0,n,,,New York|Boston|H. M. Caldwell co.|n.d.,...,1.0,,,Visits of Elizabeth | $c: [by] Elinor Glyn.,,True,Visits of Elizabeth,0.105119,0.220587,gl


In [5]:
def get_ratio(stringA, stringB):
    '''
    Fuzzy similarity between two strings.

    The cheap `real_quick_ratio()` of a SequenceMatcher is computed first.
    If it is below 0.75 that value is returned as-is; otherwise the full
    (more expensive) `ratio()` is computed and returned.
    '''
    matcher = SequenceMatcher(None, stringA, stringB)
    cheap_estimate = matcher.real_quick_ratio()

    # Only pay for the exact ratio when the cheap estimate is high enough.
    return matcher.ratio() if cheap_estimate >= 0.75 else cheap_estimate

def title_compare(stringA, stringB):
    '''
    When searching for the similarity between two titles, we use this modified comparison,
    which gives a boost to the similarity when it applies to two long strings, but also
    permits relatively high similarity in cases where one string is much longer than
    the other and only the overlapping starts are similar. This is useful because there
    are often variant fiction titles like

    Shane
    and
    Shane: A Story of the American West.
    '''
    
    stringA = stringA.lower().replace('the ', 'x') # we replace 'the' because it's a word that
    stringB = stringB.lower().replace('the ', 'x') # is relatively long for how common it is,
                                                    # and also quite likely to appear at the start
                                                    # of a title, producing false matches

    minlen = min(len(stringA), len(stringB))
    maxlen = max(len(stringA), len(stringB))
    
    diffpenalty = (maxlen - minlen) / 300

    if minlen > 3:
        stringA = stringA[0: minlen]
        stringB = stringB[0: minlen]

    if minlen < 25:
        lendiscount = 1.2 - (((25 - minlen) ** 1.1) / 100)
    else:
        lendiscount = 1.2

    m = SequenceMatcher(None, stringA, stringB)

    thefilter = m.quick_ratio()
    
    if thefilter < 0.5:
        return round((thefilter * lendiscount) - diffpenalty, 4)

    else:
        return round((m.ratio() * lendiscount) - diffpenalty, 4)

def given_name_similarity(namesA, namesB):
    '''
    Score how well two lists of given names agree, in steps of 0.04.

    The base score is the number of first letters the two lists share minus
    the number of first letters found in only one of them. When that base is
    at least 1, each name in namesA that is longer than two characters and
    also occurs verbatim in namesB adds one more point. The total (which may
    be zero or negative) is multiplied by 0.04.
    '''
    firsts_a = {name[0] for name in namesA}
    firsts_b = {name[0] for name in namesB}

    shared = len(firsts_a & firsts_b)
    unshared = len(firsts_a ^ firsts_b)
    surplus = shared - unshared

    # Full-name agreement is only rewarded once the initials already agree.
    if surplus >= 1:
        surplus += sum(1 for name in namesA if len(name) > 2 and name in namesB)

    return surplus * .04

In [6]:
# Match copyright records in `df` (copydate 1960-1969) against the volumes in
# `titles`, and write the accepted matches to 'copymatches_1960s.tsv'.
#
# For each year, copyright records are compared only with `titles` rows whose
# `latestcomp` lies strictly between (year - 3) and (year + 30), and, within
# that window, only with rows sharing the author's blocking key (`initial`).
# A pair is accepted when surname similarity plus the given-name supplement
# exceeds .98 and the title similarity exceeds .95.

_PUNCT_RE = re.compile(r'[^\w\s]')   # anything that is not a word character or whitespace


def _prepare_candidates(block):
    """Normalize every row of a `titles` block once, for reuse in the matching loop.

    Returns a list of (docid, latestcomp, author, surname, given_names, title)
    tuples, in the row order of `block`. Author and title are lowercased and
    stripped of punctuation. Rows with a missing author, an author that is
    empty after cleaning, or a missing short title are dropped, since they
    could never produce a match.
    """
    candidates = []
    for _, matchrow in block.iterrows():
        m_author = matchrow['author']
        m_title = matchrow['shorttitle']
        if pd.isnull(m_author) or pd.isnull(m_title):
            continue

        m_author = _PUNCT_RE.sub('', m_author.lower())
        m_names = m_author.split()
        if not m_names:
            continue

        m_title = _PUNCT_RE.sub('', m_title.lower())
        candidates.append((matchrow['docid'], matchrow['latestcomp'],
                           m_author, m_names[0], m_names[1:], m_title))
    return candidates


matches = []              # one dict per accepted (copyright record, volume) pair
matchedalready = set()    # docids of volumes matched at least once
ctr = 0                   # copyright records visited, across all years

for year in range(1960, 1970):
    print('year', year)
    dfslice = df.loc[df.copydate == year, : ]
    titlesneardate = titles.loc[(titles.latestcomp > (year - 3)) & (titles.latestcomp < (year + 30)), : ]

    # Per-year cache: blocking key -> prepared candidate list.
    blocks = dict()

    for _, row in dfslice.iterrows():
        ctr += 1
        if ctr % 1000 == 1:
            print(ctr)

        title = row['title']
        author = row['authors']
        if pd.isnull(title) or pd.isnull(author):
            continue

        title = _PUNCT_RE.sub('', title.lower())

        # Only the first of the '|'-separated authors is used.
        author = _PUNCT_RE.sub('', author.lower().split('|')[0])

        names = author.split()
        if not names:
            continue

        # The first token of the cleaned author string is treated as the surname.
        surname = names[0]
        given_names = names[1:]
        last_initial = getinitial(surname)

        if last_initial not in blocks:
            block = titlesneardate.loc[titlesneardate['initial'] == last_initial, : ]
            blocks[last_initial] = _prepare_candidates(block)

        for docid, latestcomp, m_author, m_surname, m_given_names, m_title in blocks[last_initial]:

            surname_match = get_ratio(surname, m_surname)
            if surname_match < .85:
                continue

            # Given names only adjust the score when the volume's author has any.
            if len(m_given_names) > 0:
                given_supp = given_name_similarity(given_names, m_given_names)
            else:
                given_supp = 0

            title_match = title_compare(title, m_title)

            if (surname_match + given_supp) > .98 and title_match > .95:

                matchrec = {
                    'copy_author': author,
                    'copy_title': title,
                    'hathi_author': m_author,
                    'hathi_title': m_title,
                    'surname_match': surname_match,
                    'given_supp': given_supp,
                    'title_match': title_match,
                    'docid': docid,
                    'copy_date': row['copydate'],
                    'hathi_date': latestcomp,
                }
                matches.append(matchrec)

                matchlen = len(matches)
                if matchlen % 100 == 1:
                    print(len(matches), 'matches')
                matchedalready.add(docid)

    # NOTE: matched volumes are not removed from `titles`, so the same volume
    # can be matched again by later records. The filter that would remove them:
    # titles = titles.loc[~titles.docid.isin(matchedalready)]
    print('Matched so far: ', len(matchedalready))

# Explicit columns keep the frame well-formed (and the column selection below
# valid) even when no matches were found.
matched_df = pd.DataFrame(matches, columns=['copy_author', 'copy_title', 'hathi_author', 'hathi_title', 'surname_match', 'given_supp', 'title_match', 'docid', 'copy_date', 'hathi_date'])
outdf = matched_df[['copy_author', 'copy_title', 'hathi_author', 'hathi_title', 'title_match', 'surname_match', 'given_supp', 'docid', 'copy_date', 'hathi_date']]
outdf.to_csv('copymatches_1960s.tsv', sep= '\t', index = False)

year 1960
1
1 matches
1001
2001
3001
4001
5001
6001
7001
8001
101 matches
9001
10001
11001
12001
13001
14001
15001
201 matches
16001
17001
18001
19001
20001
21001
22001
23001
301 matches
24001
25001
26001
27001
28001
29001
30001
31001
Matched so far:  382
year 1961
32001
401 matches
33001
34001
35001
36001
37001
38001
39001
40001
501 matches
41001
42001
43001
44001
45001
46001
47001
48001
49001
601 matches
50001
51001
52001
53001
54001
55001
56001
57001
58001
59001
701 matches
60001
Matched so far:  713
year 1962
61001
62001
63001
64001
65001
66001
67001
68001
69001
801 matches
70001
71001
72001
73001
74001
75001
76001
77001
78001
79001
901 matches
80001
81001
82001
83001
84001
85001
86001
87001
88001
1001 matches
89001
90001
91001
Matched so far:  1021
year 1963
92001
93001
94001
95001
96001
97001
1101 matches
98001
99001
100001
101001
102001
103001
104001
105001
106001
1201 matches
107001
108001
109001
110001
111001
112001
113001
114001
115001
1301 matches
116001
117001
118001
119001