# Propagate fix to copies columns

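Having improved the measure of copies in `titlemeta.tsv`, we now need to propagate that improvement outward to `weighted_subset` and `manual_title_subset` (and, further down, to `gender_balanced_subset` and the most-popular subset). Along the way, the docids in every table we load are normalized using `filename_translator.tsv`.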

Unfortunately, it's too late to reselect `weighted_subset` using the improved measure. We'll just have to live with the slight imperfection, which mostly affects the nineteenth century.

In [1]:
import pandas as pd

In [2]:
# Build the docid translation table from a tab-separated file:
# column 0 is the name to be replaced, column 1 is its replacement.
translator = {}
with open('../data/filename_translator.tsv', encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        # Blank or single-column lines (e.g. a trailing empty line) carry no
        # mapping; skip them instead of failing with IndexError on fields[1].
        if len(fields) < 2:
            continue
        # 'badname' in the first column is presumably the header row.
        if fields[0] == 'badname':
            continue
        translator[fields[0]] = fields[1]

In [3]:
# Load the table stored in masterficmetadata.tsv (tab-separated).
volume = pd.read_csv(
    '../masterficmetadata.tsv',
    sep='\t',
    low_memory=False,
)

In [4]:
def fixdocids(df, mapping=None):
    """Normalize the 'docid' column of ``df`` in place.

    Each docid is handled by the first rule that applies:

    1. If it is a key of ``mapping``, it is replaced by the mapped value.
    2. Otherwise, if it is a string containing ':/', every ':' becomes '+'
       and every '/' becomes '='.
    3. Otherwise (including missing values) it is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'docid' column; that column is overwritten in place.
    mapping : dict, optional
        Lookup from old docid to new docid. Defaults to the notebook-level
        ``translator`` dict.

    Returns
    -------
    None
        The counts of translated and cleaned docids are printed.
    """
    if mapping is None:
        mapping = translator

    trans = 0
    cleaned = 0
    fixed = []

    # Iterate over the values rather than the index labels: this works for
    # non-unique indexes and avoids one scalar .loc write per row.
    for docid in df['docid']:
        if docid in mapping:
            docid = mapping[docid]
            trans += 1
        # The isinstance guard keeps missing values (NaN floats) from raising
        # TypeError on the substring test.
        elif isinstance(docid, str) and ':/' in docid:
            docid = docid.replace(':', '+').replace('/', '=')
            cleaned += 1
        fixed.append(docid)

    # Positional assignment; the list has exactly one entry per row.
    df['docid'] = fixed

    print('Translated: ', trans)
    print('Cleaned: ', cleaned)

In [5]:
# Rewrite volume's 'docid' column in place; fixdocids prints how many ids
# were translated and how many were cleaned.
fixdocids(volume)

Translated:  1180
Cleaned:  658


In [6]:
# Load the table stored in recordmeta.tsv (tab-separated).
record = pd.read_csv(
    '../recordmeta.tsv',
    sep='\t',
    low_memory=False,
)

In [7]:
# Rewrite record's 'docid' column in place; fixdocids prints how many ids
# were translated and how many were cleaned.
fixdocids(record)

Translated:  1180
Cleaned:  0


In [8]:
# Load titlemeta.tsv, the table whose copy counts are propagated to the subsets.
title = pd.read_csv(
    '../titlemeta.tsv',
    sep='\t',
    low_memory=False,
)

In [9]:
# Rewrite title's 'docid' column in place; fixdocids prints how many ids
# were translated and how many were cleaned.
fixdocids(title)

Translated:  940
Cleaned:  388


In [10]:
# Load the weighted subset (tab-separated).
weighted = pd.read_csv(
    '../manuallists/weighted_subset.tsv',
    sep='\t',
    low_memory=False,
)

In [11]:
# Rewrite weighted's 'docid' column in place; fixdocids prints how many ids
# were translated and how many were cleaned.
fixdocids(weighted)

Translated:  33
Cleaned:  0


In [12]:
# Load the manual title subset (tab-separated).
manual = pd.read_csv(
    '../manuallists/manual_title_subset.tsv',
    sep='\t',
    low_memory=False,
)

In [13]:
# Rewrite manual's 'docid' column in place; fixdocids prints how many ids
# were translated and how many were cleaned.
fixdocids(manual)

Translated:  28
Cleaned:  0


In [14]:
# Key weighted by docid so that rows can be addressed as .loc[docid, column].
weighted = weighted.set_index('docid')

In [15]:
# Key both tables by docid so that rows can be addressed as .loc[docid, column].
title = title.set_index('docid')
manual = manual.set_index('docid')

In [16]:
# Copy the two copy-count columns from title into weighted, docid by docid.
# err counts the docids of weighted that are absent from title.
err = 0
for docid in weighted.index:
    if docid not in title.index:
        print('error', docid)
        err += 1
        continue

    all_copies = title.loc[docid, 'allcopiesofwork']
    copies_25yrs = title.loc[docid, 'copiesin25yrs']
    weighted.loc[docid, 'allcopiesofwork'] = all_copies
    weighted.loc[docid, 'copiesin25yrs'] = copies_25yrs
        

In [17]:
# Display how many docids in weighted had no match in title.
err

0

In [18]:
# Copy the two copy-count columns from title into manual, docid by docid.
# err counts the docids of manual that are absent from title.
err = 0
for docid in manual.index:
    if docid not in title.index:
        print('error', docid)
        err += 1
        continue

    all_copies = title.loc[docid, 'allcopiesofwork']
    copies_25yrs = title.loc[docid, 'copiesin25yrs']
    manual.loc[docid, 'allcopiesofwork'] = all_copies
    manual.loc[docid, 'copiesin25yrs'] = copies_25yrs
err

0

In [19]:
# Saved under a new filename, so manual_title_subset.tsv itself is not overwritten.
manual.to_csv(
    '../manuallists/new_manual_title_subset.tsv',
    sep='\t',
    index_label='docid',
)

In [20]:
# Overwrites the same file that weighted was loaded from.
weighted.to_csv(
    '../manuallists/weighted_subset.tsv',
    sep='\t',
    index_label='docid',
)

In [21]:
# Overwrites the same file that title was loaded from, now with fixed docids.
title.to_csv(
    '../titlemeta.tsv',
    sep='\t',
    index_label='docid',
)

In [22]:
# Key volume by docid before it is written back out.
volume = volume.set_index('docid')

In [23]:
# Overwrites the same file that volume was loaded from, now with fixed docids.
volume.to_csv(
    '../masterficmetadata.tsv',
    sep='\t',
    index_label='docid',
)

In [24]:
# Key record by docid, then overwrite the file it was loaded from.
record = record.set_index('docid')
record.to_csv(
    '../recordmeta.tsv',
    sep='\t',
    index_label='docid',
)

In [25]:
# Load the gender-balanced subset and normalize its docids in place.
gender = pd.read_csv(
    '../manuallists/gender_balanced_subset.tsv',
    sep='\t',
    low_memory=False,
)
fixdocids(gender)

Translated:  14
Cleaned:  0


In [27]:
# Key gender by docid so that rows can be addressed as .loc[docid, column].
gender = gender.set_index('docid')

In [28]:
# Copy the two copy-count columns from title into gender, docid by docid.
# err counts the docids of gender that are absent from title.
err = 0
for docid in gender.index:
    if docid not in title.index:
        print('error', docid)
        err += 1
        continue

    all_copies = title.loc[docid, 'allcopiesofwork']
    copies_25yrs = title.loc[docid, 'copiesin25yrs']
    gender.loc[docid, 'allcopiesofwork'] = all_copies
    gender.loc[docid, 'copiesin25yrs'] = copies_25yrs
err

0

In [29]:
# Load the most-popular subset and normalize its docids in place.
popular = pd.read_csv(
    '../reportcode/most_popular_subset.tsv',
    sep='\t',
    low_memory=False,
)
fixdocids(popular)

Translated:  31
Cleaned:  3


In [30]:
# Key popular by docid so that rows can be addressed as .loc[docid, column].
popular = popular.set_index('docid')

In [31]:
# Copy the two copy-count columns from title into popular, docid by docid.
# err counts the docids of popular that are absent from title.
err = 0
for docid in popular.index:
    if docid not in title.index:
        print('error', docid)
        err += 1
        continue

    all_copies = title.loc[docid, 'allcopiesofwork']
    copies_25yrs = title.loc[docid, 'copiesin25yrs']
    popular.loc[docid, 'allcopiesofwork'] = all_copies
    popular.loc[docid, 'copiesin25yrs'] = copies_25yrs
err

0

In [32]:
# Overwrites the same file that gender was loaded from.
gender.to_csv(
    '../manuallists/gender_balanced_subset.tsv',
    sep='\t',
    index_label='docid',
)

In [33]:
# NOTE(review): popular was loaded from ../reportcode/most_popular_subset.tsv
# but is saved to a different directory and filename -- confirm this is the
# intended rename rather than a stale path.
popular.to_csv(
    '../manuallists/frequently_reprinted_subset.tsv',
    sep='\t',
    index_label='docid',
)