# Parse copyright data

In [48]:
import pandas as pd
import xml.etree.ElementTree as ET
import glob

In [53]:
def parse_copyright_entry(child):
    """Convert one <copyrightEntry> element into a row dict.

    Returns a dict with keys 'authors' (authorName texts joined by '|'),
    'title' and 'copydate' (the first four characters of the regDate
    'date' attribute, as an int). Returns None when the entry has no
    author name, no non-empty title, or no usable registration year.

    Every field starts from an empty default, so an entry that lacks a
    <title> or <regDate> element can never inherit the value parsed from
    the previous entry.
    """
    authors = [
        authorname.text
        for author in child.findall('author')
        for authorname in author.findall('authorName')
        if authorname.text is not None
    ]

    # If an entry carries several <title> elements, the last one wins.
    title = ''
    for title_element in child.findall('title'):
        title = title_element.text if title_element.text is not None else ''

    # Likewise the last <regDate> wins; an unparseable one resets the year to 0.
    date = 0
    for regdate in child.findall('regDate'):
        try:
            date = int(regdate.attrib['date'][0:4])
        except (KeyError, ValueError):
            # Missing 'date' attribute, or a year that is not numeric.
            date = 0
            print('exception')

    if authors and title and date > 0:
        return {'authors': '|'.join(authors), 'title': title, 'copydate': date}
    return None


entries = []

for folderyear in range(1923, 1970):
    ctr = 0   # rows accepted from this folder
    # glob returns files in arbitrary order; sort so the row order is reproducible.
    xmlfiles = sorted(glob.glob('/Users/tunder/work/copyright/xml/' + str(folderyear) + '/*.xml'))
    for afilepath in xmlfiles:
        root = ET.parse(afilepath).getroot()

        for child in root.findall('copyrightEntry'):
            row = parse_copyright_entry(child)
            if row is not None:
                entries.append(row)
                ctr += 1

    # folder year, rows from this folder, running total
    print(folderyear, ctr, len(entries))

1923 8614 8614
1924 8494 17108
1925 10533 27641
1926 10029 37670
1927 11634 49304
1928 13463 62767
1929 13333 76100
1930 13606 89706
1931 13463 103169
1932 11738 114907
1933 11369 126276
1934 11258 137534
1935 12398 149932
1936 12206 162138
1937 9122 171260
1938 14023 185283
1939 11915 197198
1940 10909 208107
1941 9990 218097
1942 7983 226080
1943 6580 232660
exception
1944 5960 238620
1945 7478 246098
1946 10470 256568
1947 14628 271196
1948 12237 283433
1949 13538 296971
1950 15276 312247
1951 14896 327143
1952 15543 342686
1953 19326 362012
exception
exception
1954 28963 390975
1955 33103 424078
exception
exception
exception
exception
exception
1956 24298 448376
1957 27856 476232
1958 31277 507509
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
exception
1959 28288 535797
1960 31926 567723
1961 29354 597077
exception
exception
exception
exception
exception
exception
excep

In [54]:
# Build a table from the parsed rows: one row per accepted copyright entry,
# with the columns 'authors', 'title' and 'copydate'.
df = pd.DataFrame(entries)
df.head()

Unnamed: 0,authors,title,copydate
0,All’s well,The choice of the crowd,1922
1,American electrochemical society,Transactions of the American electrochemical s...,1923
2,American railway association. Mechanical divis...,Locomotive cyclopedia of American practice,1922
3,"Ames, William Homer|William Homer Ames, PH. B....","100 master speeches for the use of orators, st...",1922
4,Atkins. Gaius Glenn|Gaius Glenn Atkins,"The undiscovered country, and other addresses",1922


In [55]:
# Save the table as tab-separated text, without the index column.
df.to_csv('/Users/tunder/work/copyright/tabularcopyright.tsv', sep = '\t', index = False)

In [80]:
# Reload the saved copyright table from disk, replacing whatever `df` currently holds.
df = pd.read_csv('/Users/tunder/work/copyright/tabularcopyright.tsv', sep = '\t', low_memory = False)

In [81]:
import pandas as pd
from difflib import SequenceMatcher
import re

In [82]:
# Load the title metadata and keep only rows whose `latestcomp` is later than 1920.
titles = pd.read_csv('../noveltmmeta/metadata/titlemeta.tsv', sep = '\t', low_memory = False)
titles = titles.loc[titles.latestcomp > 1920, : ]
titles.shape

(75929, 30)

In [83]:
def getinitial(astring):
    """Return a short blocking key for a name string.

    The string is lowercased and stripped of every character that is
    neither a word character nor whitespace; the key is then its first
    two characters (or its single character, if only one remains).
    Null input, or a string with nothing left after stripping, gives
    the placeholder key 'xx'.
    """
    if pd.isnull(astring):
        return 'xx'

    cleaned = re.sub(r'[^\w\s]', '', astring.lower())
    if not cleaned:
        return 'xx'

    # Slicing handles the one-character case too.
    return cleaned[:2]

# Add a blocking key computed from each row's author string.
titles['initial'] = titles['author'].map(getinitial)
titles.head()

Unnamed: 0,docid,oldauthor,author,authordate,inferreddate,latestcomp,datetype,startdate,enddate,imprint,...,copiesin25yrs,enumcron,volnum,title,parttitle,earlyedition,shorttitle,nonficprob,juvenileprob,initial
6,mdp.39015059414725,"McKenna, Stephen","McKenna, Stephen",1888-1967.,,1967.0,n,,,London|Hutchinson & co.|n.d.,...,1.0,,,"Lady Lilith, | a novel: Beong the first part o...",,True,"Lady Lilith, a novel: Beong the first part of ...",0.165138,0.001126,mc
7,mdp.39015063920006,"Haggard, H. Rider (Henry Rider)","Haggard, H. Rider (Henry Rider)",1856-1925.,,1925.0,n,,,New York|J. S. Ogilvie|n.d.,...,1.0,,,Beatrice | [a novel] | $c: by H. Rider Haggard.,,True,Beatrice [a novel],0.070872,0.001023,ha
8,mdp.39015035876971,"Malet, Lucas","Malet, Lucas",1852-1931.,,1931.0,n,,,New York|T. Y. Crowell & co.|n.d.,...,1.0,,,Little Peter: | a Christmas morality for child...,,True,Little Peter: a Christmas morality for childre...,0.094639,0.512603,ma
13,mdp.39015063933132,"Lawrence, Charles Edward","Lawrence, Charles Edward",1870-1940.,,1940.0,n,,,New York|E. P. Dutton & con.|n.d.,...,1.0,,,"The god in the thicket, | $c: by C. E. Lawrence.",,True,The god in the thicket,0.066802,0.336583,la
21,mdp.39015063548997,"Glyn, Elinor","Glyn, Elinor",1864-1943.,,1943.0,n,,,New York|Boston|H. M. Caldwell co.|n.d.,...,1.0,,,Visits of Elizabeth | $c: [by] Elinor Glyn.,,True,Visits of Elizabeth,0.105119,0.220587,gl


In [84]:
def get_ratio(stringA, stringB):
    '''
    Fuzzy similarity between two strings.

    SequenceMatcher.real_quick_ratio() is a cheap upper bound on the true
    ratio. When that bound is below 0.75 it is returned as-is and the
    expensive comparison is skipped; otherwise the exact ratio() is returned.
    '''
    matcher = SequenceMatcher(None, stringA, stringB)
    upper_bound = matcher.real_quick_ratio()
    return matcher.ratio() if upper_bound >= 0.75 else upper_bound

def title_compare(stringA, stringB):
    '''
    Similarity score for two titles, tuned for variant fiction titles such as

    Shane
    and
    Shane: A Story of the American West.

    Only the overlapping starts are compared: when the shorter title has
    more than three characters, both are cut to its length. The raw
    similarity is multiplied by a length factor that is 1.2 once the
    shorter title reaches 25 characters and smaller below that, so the
    result can exceed 1. A penalty of 1/300 per character of length
    difference is then subtracted. The result is rounded to four places.
    '''

    # 'the ' is long for how common it is, and often opens a title, so it
    # is collapsed to a single character to avoid false matches.
    first = stringA.lower().replace('the ', 'x')
    second = stringB.lower().replace('the ', 'x')

    shorter, longer = sorted((len(first), len(second)))
    diffpenalty = (longer - shorter) / 300

    if shorter > 3:
        first, second = first[:shorter], second[:shorter]

    lendiscount = 1.2
    if shorter < 25:
        lendiscount = 1.2 - (((25 - shorter) ** 1.1) / 100)

    matcher = SequenceMatcher(None, first, second)

    # quick_ratio() is an upper bound; pay for the exact ratio() only when
    # the bound says the pair could be similar.
    similarity = matcher.quick_ratio()
    if similarity >= 0.5:
        similarity = matcher.ratio()

    return round((similarity * lendiscount) - diffpenalty, 4)

def given_name_similarity(namesA, namesB):
    '''
    Score the agreement between two lists of given names, in steps of .04.

    The base score is the number of first letters the two lists share,
    minus the number of first letters found in only one of them. If that
    base is at least 1, one extra point is added for every name in namesA
    that is longer than two characters and also occurs in namesB. A base
    below 1 is returned without any bonus, so the result can be zero or
    negative.
    '''
    firsts_a = {name[0] for name in namesA}
    firsts_b = {name[0] for name in namesB}

    score = len(firsts_a & firsts_b) - len(firsts_a ^ firsts_b)

    if score >= 1:
        score += sum(1 for name in namesA if len(name) > 2 and name in namesB)

    return score * .04

In [91]:
# Match copyright registrations (df) against Hathi volumes (titles), one
# registration year at a time. A pair is recorded when the surnames are
# nearly identical (with support from given names) and the titles are
# similar. To limit comparisons, a registration is only compared with
# volumes whose 'initial' key equals the key of its first author's surname.

def _normalize(text):
    """Lowercase *text* and drop every character that is neither a word character nor whitespace."""
    return re.sub(r'[^\w\s]', '', text.lower())


def _prepare_candidates(block):
    """Normalise the Hathi rows of one block once, so the work is not repeated per registration.

    Returns a list of tuples
    (author, surname, given_names, title, docid, latestcomp).
    Rows with a null author, an author that is empty after normalisation,
    or a null shorttitle are dropped, because they can never produce a match.
    """
    candidates = []
    for _, matchrow in block.iterrows():
        m_author = matchrow['author']
        m_title = matchrow['shorttitle']
        if pd.isnull(m_author) or pd.isnull(m_title):
            continue

        m_author = _normalize(m_author)
        m_names = m_author.split()
        if not m_names:
            continue

        candidates.append((m_author, m_names[0], m_names[1:], _normalize(m_title),
                           matchrow['docid'], matchrow['latestcomp']))
    return candidates


matches = []
matchedalready = set()
ctr = 0

for year in range(1922, 1941):
    print('year', year)
    dfslice = df.loc[df.copydate == year, : ]
    # Only volumes dated strictly between (year - 3) and (year + 30) are candidates.
    titlesneardate = titles.loc[(titles.latestcomp > (year - 3)) & (titles.latestcomp < (year + 30)), : ]
    blocks = dict()   # blocking key -> prepared candidate list, rebuilt for each year

    for _, row in dfslice.iterrows():
        ctr += 1
        if ctr % 1000 == 1:
            print(ctr)

        title = row['title']
        if pd.isnull(title):
            continue
        title = _normalize(title)

        author = row['authors']
        if pd.isnull(author):
            continue
        # 'authors' holds name variants joined by '|'; only the first is used.
        author = re.sub(r'[^\w\s]', '', author.lower().split('|')[0])

        names = author.split()
        if not names:
            continue
        surname = names[0]
        given_names = names[1:]
        last_initial = getinitial(surname)

        candidates = blocks.get(last_initial)
        if candidates is None:
            candidates = _prepare_candidates(
                titlesneardate.loc[titlesneardate['initial'] == last_initial, : ])
            blocks[last_initial] = candidates

        for m_author, m_surname, m_given_names, m_title, docid, hathi_date in candidates:

            surname_match = get_ratio(surname, m_surname)
            if surname_match < .85:
                continue

            if m_given_names:
                given_supp = given_name_similarity(given_names, m_given_names)
            else:
                given_supp = 0

            # The name test is cheap; skip the title comparison when it fails.
            if (surname_match + given_supp) <= .98:
                continue

            title_match = title_compare(title, m_title)
            if title_match <= .95:
                continue

            matchrec = dict()
            matchrec['copy_author'] = author
            matchrec['copy_title'] = title
            matchrec['hathi_author'] = m_author
            matchrec['hathi_title'] = m_title
            matchrec['surname_match'] = surname_match
            matchrec['given_supp'] = given_supp
            matchrec['title_match'] = title_match
            matchrec['docid'] = docid
            matchrec['copy_date'] = row['copydate']
            matchrec['hathi_date'] = hathi_date

            matches.append(matchrec)

            if len(matches) % 100 == 1:
                print(len(matches), 'matches')
            matchedalready.add(docid)

    # Matched volumes stay in `titles` (the filter below is disabled), so a
    # volume can be matched again by a later registration.
    # titles = titles.loc[~titles.docid.isin(matchedalready)]
    print('Matched so far: ', len(matchedalready))

matched_df = pd.DataFrame(matches)
out_columns = ['copy_author', 'copy_title', 'hathi_author', 'hathi_title', 'title_match',
               'surname_match', 'given_supp', 'docid', 'copy_date', 'hathi_date']
# reindex (rather than [[...]]) still yields the right header when no match was found.
outdf = matched_df.reindex(columns=out_columns)
outdf.to_csv('copymatches_1922-40.tsv', sep= '\t', index = False)
    

year 1922
1
1 matches
1001
Matched so far:  25
year 1923
2001
101 matches
3001
201 matches
4001
5001
301 matches
6001
401 matches
7001
8001
9001
Matched so far:  377
year 1924
501 matches
10001
11001
601 matches
12001
13001
701 matches
14001
15001
801 matches
16001
17001
18001
Matched so far:  682
year 1925
19001
901 matches
20001
1001 matches
21001
22001
1101 matches
23001
24001
1201 matches
25001
26001
27001
1301 matches
28001
29001
Matched so far:  1011
year 1926
30001
1401 matches
31001
32001
1501 matches
33001
1601 matches
34001
35001
1701 matches
36001
37001
38001
39001
Matched so far:  1347
year 1927
1801 matches
40001
41001
1901 matches
42001
43001
44001
2001 matches
45001
46001
2101 matches
47001
2201 matches
48001
49001
50001
51001
52001
Matched so far:  1741
year 1928
2301 matches
53001
54001
2401 matches
55001
56001
57001
2501 matches
58001
59001
2601 matches
60001
61001
2701 matches
62001
63001
64001
65001
Matched so far:  2152
year 1929
66001
2801 matches
67001
2901 match

In [87]:
# Select and order the columns to export.
# NOTE(review): these column names ('author', 'title', 'm_author', 'copydate',
# 'hathidate', ...) differ from the keys the matching loop in this file writes
# ('copy_author', 'copy_title', 'hathi_author', 'copy_date', 'hathi_date', ...).
# This cell appears to belong to an earlier version of matched_df and would
# raise KeyError against the current one -- confirm before re-running.
outdf = matched_df[['author', 'title', 'm_author', 'm_title', 'title_match', 'surname_match', 'given_supp', 'docid', 'copydate', 'hathidate']]

In [88]:
# Write the selected columns as tab-separated text, without the index column.
outdf.to_csv('copymatches_27-40.tsv', sep= '\t', index = False)

In [76]:
# Preview the first rows of the match table.
matched_df.head()

Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
0,ransome arthur,ransome,the soldier and death,the soldier and death a russian folk tale told...,[arthur],1.0,0.08,1.114963,inu.39000005845404,1922,1922.0
1,bramah ernest,bramah,kai lungs golden hours,kai lungs golden hours,[ernest],1.0,0.08,1.166516,uc1.$b312546,1922,1922.0
2,moss geoffrey,moss,sweet pepper,sweet pepper,[geoffrey],1.0,0.08,1.031989,uc1.$b302069,1922,1923.0
3,boddy elias manchester,boddy,the yellow trail a story of salmon river gold,the yellow trail,"[e, manchester]",1.0,0.12,1.046149,uc2.ark+=13960=t8ff3nf9p,1922,1922.0
4,sadleir michael,sadleir,desolate splendour,desolate splendour,[michael],1.0,0.08,1.114963,uc1.$b312441,1922,1923.0


In [79]:
from IPython.display import display

# Show the match rows for up to nine volumes (docids) that were matched more
# than once. The loop variable is named `group`, not `df`, so that iterating
# here does not overwrite the copyright table held in the global `df`.
ctr = 0
for _, group in matched_df.groupby('docid'):
    if group.shape[0] > 1:
        display(group)
        ctr += 1
    if ctr > 8:
        break

Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
1426,hall radclyffe,hall,adams breed,adams breed,[radclyffe],1.0,0.08,1.017719,inu.30000011829193,1925,1926.0
1573,hall radclyffe,hall,adams breed,adams breed,[radclyffe],1.0,0.08,1.017719,inu.30000011829193,1926,1926.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
5,stockley cynthia,stockley,ponjola a novel,ponjola,[cynthia],1.0,0.08,0.959675,inu.30000115733739,1922,1923.0
90,stockley cynthia,stockley,ponjola,ponjola,[cynthia],1.0,0.08,0.959675,inu.30000115733739,1923,1923.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
1527,haggard sir henry rider,haggard,the treasure of the lake,treasure of the lake,"[h, rider, henry, rider]",1.0,0.12,1.036714,inu.32000000663494,1926,1925.0
1579,haggard sir henry rider,haggard,treasure of the lake,treasure of the lake,"[h, rider, henry, rider]",1.0,0.12,1.101508,inu.32000000663494,1926,1925.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
421,bennett arnold,bennett,riceyman steps,riceyman steps and elsie and the child,[arnold],1.0,0.08,1.060192,inu.39000000300553,1923,1931.0
459,bennett arnold,bennett,riceyman steps,riceyman steps and elsie and the child,[arnold],1.0,0.08,1.060192,inu.39000000300553,1923,1931.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
49,montague charles edward,montague,fiery particles,fiery particles,"[c, e, charles, edward]",1.0,0.16,1.074107,mdp.39015000017817,1923,1923.0
1720,montague charles edward,montague,fiery particles,fiery particles,"[c, e, charles, edward]",1.0,0.16,1.074107,mdp.39015000017817,1926,1923.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
1507,chesterton gilbert keith,chesterton,the return of don quixote chap 7 contd9,the return of don quixote,"[g, k, gilbert, keith]",1.0,0.16,1.166516,mdp.39015000603939,1926,1927.0
1556,chesterton gilbert keith,chesterton,the return of don quixote chap 9 contd10,the return of don quixote,"[g, k, gilbert, keith]",1.0,0.16,1.166516,mdp.39015000603939,1926,1927.0
1625,chesterton gilbert keith,chesterton,the return of don quixote,the return of don quixote,"[g, k, gilbert, keith]",1.0,0.16,1.166516,mdp.39015000603939,1926,1927.0
1754,chesterton gilbert keith,chesterton,the return of don quixote,the return of don quixote,"[g, k, gilbert, keith]",1.0,0.16,1.166516,mdp.39015000603939,1926,1927.0
1837,chesterton gilbert keith,chesterton,the return of don quixote,the return of don quixote,"[g, k, gilbert, keith]",1.0,0.16,1.166516,mdp.39015000603939,1926,1927.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
1047,lawrence david herbert,lawrence,st mawr together with the princess,st mawr together with the princess,"[d, h, david, herbert]",1.0,0.16,1.2,mdp.39015000632292,1925,1925.0
1113,lawrence david herbert,lawrence,st mawr,st mawr together with the princess,"[d, h, david, herbert]",1.0,0.16,0.959675,mdp.39015000632292,1925,1925.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
1048,lawrence david herbert,lawrence,st mawr together with the princess,st mawr,"[d, h, david, herbert]",1.0,0.16,0.959675,mdp.39015000632300,1925,1925.0
1114,lawrence david herbert,lawrence,st mawr,st mawr,"[d, h, david, herbert]",1.0,0.16,0.959675,mdp.39015000632300,1925,1925.0


Unnamed: 0,author,m_surname,m_title,title,m_given_names,surname_match,given_supp,title_match,docid,copydate,hathidate
1419,milne alen alexander,milne,winniethepooh,winniethepooh,"[a, a, alan, alexander]",1.0,0.08,1.046149,mdp.39015002131749,1925,1933.0
1496,milne alan alexander,milne,winniethepooh goes visiting,winniethepooh,"[a, a, alan, alexander]",1.0,0.12,1.046149,mdp.39015002131749,1926,1933.0
1568,milne alan alexander,milne,winniethepooh finds a tail,winniethepooh,"[a, a, alan, alexander]",1.0,0.12,1.046149,mdp.39015002131749,1926,1933.0
1583,milne alan alexander,milne,winniethepooh goes hunting,winniethepooh,"[a, a, alan, alexander]",1.0,0.12,1.046149,mdp.39015002131749,1926,1933.0
1776,milne alan alexander,milne,winniethepooh,winniethepooh,"[a, a, alan, alexander]",1.0,0.12,1.046149,mdp.39015002131749,1926,1933.0
1902,milne alan alexander,milne,winniethepooh,winniethepooh,"[a, a, alan, alexander]",1.0,0.12,1.046149,mdp.39015002131749,1926,1933.0
