# Selecting LoC-genre-tagged volumes

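This notebook creates metadata for genrexp2 by selecting volumes that match predefined Library-of-Congress genre categories. For the most part, genre-tagged volumes are sampled at random within each decade, so each genre appears roughly in proportion to its prevalence in Hathi; the sample is then topped up so that every genre has at least three volumes per decade wherever that many are available. The notebook also selects a set of completely random volumes for each decade, which may overlap with the genre-tagged volumes but will also include untagged volumes.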

#### First we load the categories we're going to be using. These differ from the last experiment in lacking the too-general category "novel."

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import random, csv

# Decade boundaries 1870, 1880, ..., 2010. Later cells treat each value as the
# exclusive upper limit of a period (a volume belongs to the first ceiling
# that is greater than its date).
dateceilings = [ceiling for ceiling in range(1870, 2020, 10)]

# Volume-level metadata, one row per volume.
meta = pd.read_csv(
    '/Users/tunder/Dropbox/python/noveltmmeta/metadata/titlemeta.tsv',
    sep = '\t',
    low_memory = False,
)

# Each row of the genre-groups file defines one category: the first field is
# the category name, and the whole row (name included) is the set of labels
# that count as a match for that category.
with open('genre_groups_for_20c_exp.tsv', encoding = 'utf-8') as f:
    categories = {
        labels[0]: set(labels)
        for labels in csv.reader(f, delimiter = '\t')
    }

allcats = [name for name in categories]
print(allcats)

['tales', 'domestic fiction', 'war stories', 'sea stories', 'ghost stories', 'western stories', 'adventure stories', 'psychological fiction', 'science fiction', 'autobiography', 'love stories', 'imaginary voyages', 'mystery and detective fiction', 'humor', 'short stories', 'folklore', 'legends', 'historical fiction', 'fairy tales']


**Now we iterate through the metadata and identify genre categories for each volume, while also keeping a list of all genre-tagged volumes in each decade, and all vols of any kind in each decade.**

In [2]:
volcategories = dict()  # genre categories for each volume (keyed by row index in meta)
genretaggedinperiod = dict() # period ceilings are keys, the values are sets
                             # of vols in that period with genre tags
allinperiod = dict()   # period ceilings are keys, the values
                       # are sets of all vols in that period
byperiodandgenre = dict() # a dict of dicts where the leaves contain a set of vols
                            # in that period and genre

# initialize the sets
for d in dateceilings:
    genretaggedinperiod[d] = set()
    allinperiod[d] = set()
    byperiodandgenre[d] = {c: set() for c in allcats}

# some books we already know are too short for our purposes;
# the file holds one docid per line

with open('booksthataretooshort.txt', encoding = 'utf-8') as f:
    tooshort = {line.strip() for line in f}

ctr = 0

for ctr, (idx, row) in enumerate(meta.iterrows(), start = 1):
    # progress indicator: prints 1, 10001, 20001, ...
    if ctr % 10000 == 1:
        print(ctr)

    docid = row.docid
    if docid in tooshort:
        continue

    # subjects and genres are pipe-delimited strings; missing values become empty lists
    subjects = list() if pd.isnull(row['subjects']) else row['subjects'].split('|')
    genres = list() if pd.isnull(row['genres']) else row['genres'].split('|')

    # a missing date becomes 0, which the next test then rejects
    date = 0 if pd.isnull(row['latestcomp']) else int(row['latestcomp'])

    if date < 1860:
        continue

    # Find the first ceiling strictly greater than the date. The else branch
    # runs only when no ceiling qualified (date >= the last ceiling); such a
    # volume belongs to none of the defined periods, so it is skipped rather
    # than being silently filed under the final period.
    for ceiling in dateceilings:
        if date < ceiling:
            break
    else:
        continue

    allinperiod[ceiling].add(idx)
    volcategories[idx] = set()

    # lowercase every genre and subject label once, then test each category
    # for any overlap with those labels
    labels = {label.lower() for label in genres}
    labels.update(label.lower() for label in subjects)

    for cat, catset in categories.items():
        if not labels.isdisjoint(catset):
            volcategories[idx].add(cat)
            genretaggedinperiod[ceiling].add(idx)
            byperiodandgenre[ceiling][cat].add(idx)

1
10001
20001
30001
40001
50001
60001
70001
80001
90001
100001
110001
120001
130001


**Now we proceed to actually select volumes.**

In [4]:
# Sampling parameters.
GENRE_SAMPLE_MAX = 400     # cap on genre-tagged volumes drawn per decade
RANDOM_SAMPLE_SIZE = 75    # completely random volumes drawn per decade
MIN_PER_GENRE = 3          # floor for each genre in each decade, where available
SELECTION_SEED = 42        # fixed seed so the selection can be reproduced

# A dedicated seeded generator makes the selection repeatable on a fresh
# kernel without altering the state of the global random module.
rng = random.Random(SELECTION_SEED)

# the crucial set, containing volumes we have selected
# (members are row indexes of meta)

allselected = set()

# the next two variables are just counters, keyed by decade ceiling:
# genreselected[d][cat] is the number of selected volumes tagged cat,
# onlygenre[d][cat] counts the subset whose one and only category is cat

genreselected = dict()
onlygenre = dict()

supplementctr = 0

for d in dateceilings:
    genreselected[d] = Counter()
    onlygenre[d] = Counter()

    # random.sample needs a sequence (passing a set raises TypeError from
    # Python 3.11 on). Sorting also fixes the order, so a given seed always
    # produces the same draw.
    tagged = sorted(genretaggedinperiod[d])
    k = min(len(tagged), GENRE_SAMPLE_MAX)

    genreselect = rng.sample(tagged, k)
    print('genre', d, k)

    # never ask for more volumes than the decade contains
    candidates = sorted(allinperiod[d])
    randomselect = rng.sample(candidates, min(len(candidates), RANDOM_SAMPLE_SIZE))

    for avol in genreselect:
        allselected.add(avol)
        singlecategory = len(volcategories[avol]) < 2
        for cat in volcategories[avol]:
            genreselected[d][cat] += 1
            if singlecategory:
                onlygenre[d][cat] += 1

    # the 'random' label is added only after the genre counts above, so it
    # never affects the single-category test
    for avol in randomselect:
        allselected.add(avol)
        volcategories[avol].add('random')

    # we ensure there are at least 3 volumes in each genre in each decade
    # (or as many as exist, if fewer are still unselected)

    for c in allcats:
        if genreselected[d][c] < MIN_PER_GENRE:
            stillpossibles = sorted(byperiodandgenre[d][c] - allselected)
            toget = min(MIN_PER_GENRE - genreselected[d][c], len(stillpossibles))

            print(d, c, toget)
            for m in rng.sample(stillpossibles, toget):
                allselected.add(m)
                genreselected[d][c] += 1
                supplementctr += 1


print("We have", len(allselected), "volumes.")
print("Of those", supplementctr, "were supplements to meet at least 3 per genre per decade.")

genre 1870 356
1870 ghost stories 0
genre 1880 400
1880 ghost stories 1
1880 psychological fiction 0
genre 1890 400
genre 1900 400
1900 domestic fiction 1
1900 ghost stories 1
1900 imaginary voyages 1
genre 1910 400
1910 ghost stories 1
1910 imaginary voyages 3
genre 1920 400
1920 imaginary voyages 3
genre 1930 400
1930 imaginary voyages 2
genre 1940 400
1940 ghost stories 1
1940 psychological fiction 1
1940 imaginary voyages 0
genre 1950 400
1950 ghost stories 1
1950 imaginary voyages 0
genre 1960 400
1960 ghost stories 3
1960 imaginary voyages 0
genre 1970 400
1970 domestic fiction 2
1970 ghost stories 1
1970 adventure stories 2
1970 imaginary voyages 0
genre 1980 400
1980 sea stories 1
1980 ghost stories 1
1980 adventure stories 1
1980 imaginary voyages 0
genre 1990 400
1990 sea stories 2
1990 adventure stories 2
1990 imaginary voyages 0
genre 2000 400
2000 sea stories 2
2000 autobiography 1
2000 imaginary voyages 0
genre 2010 400
2010 ghost stories 1
2010 autobiography 3
2010 imagi

In [5]:
# just confirming that we didn't select any books that were too short
#
# allselected holds row indexes of meta, whereas tooshort holds docid strings,
# so each selected row has to be translated to its docid before the comparison
# (subtracting the two sets directly could never remove anything).

notshort = [idx for idx in allselected if meta.at[idx, 'docid'] not in tooshort]

# the two numbers printed below should be equal
print(len(allselected))
print(len(notshort))


7081
7081


In [7]:
# blank out missing values so every field can be written as text
meta = meta.replace(np.nan, '', regex=True)

def _tsvfield(value):
    """Return value as a string that is safe to place in one TSV cell.

    Tabs and line breaks are replaced by single spaces, because the rows
    below are assembled by hand and an embedded delimiter would shift or
    split the columns.
    """
    text = str(value)
    for ch in ('\t', '\r', '\n'):
        text = text.replace(ch, ' ')
    return text

with open('genre_assignments_4loc2.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('docid\texp_genres\tdate\tauthor\ttitle\toriggenres\torigsubjs\tremove\n')

    # sorted() on both the rows and the genre labels keeps the output file
    # identical from run to run; plain set iteration order is not guaranteed
    for idx in sorted(allselected):
        docid = meta.at[idx, 'docid']
        experimentgenres = '|'.join(sorted(volcategories[idx]))
        date = meta.at[idx, 'latestcomp']
        author = meta.at[idx, 'author']
        title = meta.at[idx, 'title']
        origgenres = meta.at[idx, 'genres']
        origsubjs = meta.at[idx, 'subjects']

        fields = [docid, experimentgenres, date, author, title, origgenres, origsubjs]
        # the final 'remove' column is written as a single space
        f.write('\t'.join(_tsvfield(x) for x in fields) + '\t \n')
        

In [13]:
# Write a genre-by-decade table of how many volumes were selected.
with open('selectedtable.tsv', mode = 'w', encoding = 'utf-8') as f:
    # columns are labelled with each ceiling minus ten
    decadelabels = [str(ceiling - 10) for ceiling in dateceilings]
    f.write('\t'.join(['genrecategory'] + decadelabels) + '\n')

    # one row per genre category; a genre absent from a decade's counter is written as 0
    for genre in allcats:
        counts = [str(genreselected[ceiling].get(genre, 0)) for ceiling in dateceilings]
        f.write('\t'.join([genre] + counts) + '\n')

In [9]:
# Size of the most recent per-decade random sample multiplied by the number
# of decades (randomselect still holds the sample from the last decade processed).
len(dateceilings) * len(randomselect)


1125