# Genre propagation

Genres get assigned to MARC records quite late in the game (it's not really common till the 1990s). So in restricting our dataset to the earliest available example of a book, we lose a lot of genre info.

This notebook aims to restore it.

In [14]:
import pandas as pd
import numpy as np
import csv

In [2]:
volmap = dict()
with open('allgroups.tsv', encoding = 'utf-8') as f:
    for line in f:
        vols = line.strip().split('\t')
        volset = set(vols)
        for v in vols:
            if v not in volmap:
                volmap[v] = set(volset)
            else:
                for v1 in vols:
                    volmap[v].add(v1)


In [6]:
work = pd.read_csv('../workmeta.tsv', sep = '\t', low_memory = False, index_col = 'docid')
manifest = pd.read_csv('../manifestationmeta.tsv', sep = '\t', low_memory = False, index_col = 'docid')

In [18]:
genres = dict()
subjects = dict()

increased = []

with open('../manifestationmeta.tsv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f, delimiter = '\t')
    for row in reader:
        docid = row['docid']
        s = row['subjects'].replace(';', '|')
        g = row['genres'].replace(';', '|')
        subjects[docid] = set(s.split('|'))
        genres[docid] = set(g.split('|'))

print('loaded')

allrows = []

with open('../workmeta.tsv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f, delimiter = '\t')
    fieldnames = reader.fieldnames
    for row in reader:
        docid = row['docid']
        s = row['subjects'].replace(';', '|')
        g = row['genres'].replace(';', '|')
        s = set(s.split('|'))
        g = set(g.split('|'))
        glen = len(g)
        if docid in volmap:
            for d in volmap[docid]:
                s = s.union(subjects[d])
                g = g.union(genres[d])
        
        increase = len(g) - glen
        increased.append(increase)
        
        g = [x.strip() for x in g if len(x.strip()) > 1]
        s = [x.strip() for x in s if len(x.strip()) > 1]
        
        row['genres'] = '|'.join(g)
        row['subjects'] = '|'.join(s)
        allrows.append(row)

fieldnames.pop(fieldnames.index('shorttitle'))
fieldnames.append('shorttitle')
# make title last

with open('../enrichedworkmeta.tsv', mode = 'w', encoding = 'utf-8') as f:
    writer = csv.DictWriter(f, fieldnames = fieldnames, delimiter = '\t')
    writer.writeheader()
    for row in allrows:
        writer.writerow(row)
        
            
increased = np.array(increased)
print(np.mean(increased))


loaded
0.132679875775


In [12]:
sum(increased) / len(increased)

0.13267987577549822

In [None]:
print(work.columns.tolist())