# Fuse manycopies data

This unites the three "manycopies" files, which then will be used to supplement our existing data to produce a reprint-proportional sample.

In [1]:
import pandas as pd

In [2]:
jmc = pd.read_csv('../copies/jessicamanycopies.csv', index_col = 'docid')
tmc = pd.read_csv('../copies/tedmanycopies.csv', index_col = 'docid')
pmc = pd.read_csv('../copies/patrickmanycopies.csv', index_col = 'docid')

In [4]:
columns = tmc.columns.tolist()
columns

['author',
 'realname',
 'pseudonym',
 'gender',
 'nationality',
 'authordate',
 'inferreddate',
 'firstpub',
 'latestcomp',
 'allcopiesofwork',
 'copiesin25yrs',
 'enumcron',
 'imprint',
 'genres',
 'subjects',
 'category',
 'shorttitle']

In [17]:
def reconcile_rows(existingrow, newrow):
    ''' Accepts two rows as dicts and reconciles them
    to produce a consensus. Generally the rule
    '''
    consensus = dict()
    
    takelonger = ['realname', 'pseudonym', 'nationality', 'gender',
                  'authordate', 'enumcron', 'imprint', 'genres',
                 'subjects', 'shorttitle', 'latestcomp']
    shouldagree = ['author', 'copiesin25yrs', 'allcopiesofwork', 
                   'inferreddate']
    
    takelonger.extend(shouldagree)
    
    # The columns 'firstpub' and 'category' are special cases,
    # governed by separate rules
    
    for col in takelonger:
        if pd.isnull(existingrow[col]): 
            existinglen = 0
        else:
            existinglen = len(str(existingrow[col]))
            
        if pd.isnull(newrow[col]):
            newlen = 0
        else:
            newlen = len(str(newrow[col]))
        
        if newlen > existinglen:
            consensus[col] = newrow[col]
        else:
            # notice that in cases of a tie, the
            # existing value remains
            consensus[col] = existingrow[col]
    
        if existingrow[col] != newrow[col] and col in shouldagree:
            print('Disagreement: ', col, existingrow[col], newrow[col])
    
    if pd.isnull(existingrow['firstpub']):
        existfirst = 3000
    else:
        existfirst = int(existingrow['firstpub'])
        
    if pd.isnull(newrow['firstpub']):
        newfirst = 3000
    else:
        newfirst = int(newrow['firstpub'])
    
    if newfirst < existfirst:
        consensus['firstpub'] = newfirst
    elif existfirst > 2050:
        consensus['firstpub'] = float('nan')
    else:
        consensus['firstpub'] = existfirst     
    
    if not pd.isnull(consensus['firstpub']):
        firstpub = int(consensus['firstpub'])
        if firstpub < int(consensus['latestcomp']):
            consensus['latestcomp'] == firstpub
        
        # we take the earliest date as latestcomp;
        # we leave nulls in the firstpub row because they
        # may indicate cases where there was no new
        # information discovered; this may affect our certainty
    
    if existingrow['category'] == newrow['category']:
        consensus['category'] = existingrow['category']
        
    # before 1890 we are pretty skeptical about short stories
    
    elif int(consensus['latestcomp']) < 1890 and newrow['category'] == 'shortstories':
        consensus['category'] = existingrow['category']
    
    elif int(consensus['latestcomp']) < 1890 and existingrow['category'] == 'shortstories':
        consensus['category'] = newrow['category']
    
    # if people say this is a reprint, we believe them
    elif newrow['category'] == 'reprint':
        consensus['category'] = 'reprint'
        
    else:
        consensus['category'] = existingrow['category']
        # be default we trust the existing testimony
        
    return consensus
    

In [18]:
sequence = [tmc, jmc, pmc]

exist = dict()

def get_dict(row):
    global columns
    adict = dict()
    for col in columns:
        adict[col] = row[col]
    
    if pd.isnull(adict['firstpub']):
        first = 3000
    else:
        first = int(adict['firstpub'])
    
    if first < int(adict['latestcomp']):
        adict['latestcomp'] = first
        
    return adict
        

for df in sequence:
    for idx, row in df.iterrows():
        if idx not in exist:
            exist[idx] = get_dict(row)
        else:
            exist_row = exist[idx]
            new_row = get_dict(row)
            consensus = reconcile_rows(exist_row, new_row)
            exist[idx] = consensus

print(len(exist))

Disagreement:  author nan nan
Disagreement:  author Hill, Herbert, ed. Hill, Herbert
Disagreement:  author nan nan
Disagreement:  author nan nan
482


In [19]:
df = pd.DataFrame.from_dict(exist, orient = 'index')
df.head()

Unnamed: 0,enumcron,subjects,shorttitle,authordate,inferreddate,latestcomp,nationality,copiesin25yrs,author,category,allcopiesofwork,gender,pseudonym,genres,realname,firstpub,imprint
coo.31924022000370,v.2,,Tower of ivory; a novel,1857-1948.,1910,1910,us,7,"Atherton, Gertrude Franklin Horn",novel,7,f,,Fiction|NotFiction,,,Leipzig;B. Tauchnitz;1910.
coo.31924060446899,v.2,,Temptation,1860-1921.,1907,1907,uk,2,"Bagot, Richard",novel,2,m,,NotFiction,,,Leipzig;B. Tauchnitz;1907.
coo.31924064975323,v.2,,"The extermination of love, a fragmentary study...",1849-1905.,1901,1901,,2,"Gerard, E. (Emily)",nonfic,2,,,NotFiction,,,Leipzig;B. Tauchnitz;1901.
coo.31924065002358,v.2,823Y8TW,The two sides of the shield,1823-1901.,1885,1885,uk,7,"Yonge, Charlotte M. (Charlotte Mary)",novel,7,f,,Fiction|NotFiction,"Yonge, Charlotte Mary",,Leipzig;B. Tauchnitz;1885.
dul1.ark+=13960=t2b85xw79,v.1,Utopias,Our own Pompeii,1856-1941.,1887,1887,uk,2,"Fox, S. M",novel,2,m,,,,,Edinburgh;W. Blackwood and Sons;1887.


In [20]:
reconciled = df[columns]
reconciled.sort_values(by = 'latestcomp', inplace = True)

In [13]:
ls

compare_manycopies.ipynb  fuse_manycopies.ipynb


In [21]:
reconciled.to_csv('manycopies_reconciled.tsv', sep = '\t', index_label = 'docid')

In [22]:
sum(reconciled.category == 'novel')

392