# Make richer reception metadata

This pairs our matched books with reception metadata in a way that allows richer analysis.

Note this version is different than and better than the one in asym300.

In [1]:
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz

In [2]:
match = pd.read_csv('meta_todescribe.tsv', sep = '\t')

In [3]:
match.loc[match.docid.duplicated(keep = False), :]

Unnamed: 0,docid,author,title,latestcomp,in_topic_model,hathi_author,hathi_title,topicmodel_firstpub,birthyear,us_national,...,rownum,is_bestseller,bestseller_year,is_prize_author,is_retroavant,retrospective_firstpub,consensus_earliest_date,author_age,standardized_name,review_comparison


In [4]:
tokeep = []
already = set()
for idx in match.docid:
    if idx in already:
        tokeep.append(False)
    else:
        tokeep.append(True)
    already.add(idx)

In [5]:
print(match.shape)
match = match.loc[tokeep, :]
match.shape

(10339, 23)


(10339, 23)

In [6]:
def get_path(year):
    root = '/Users/tunder/Dropbox/python/avant/'
    if year < 1931:
        path = root + '1916-30revs/fiction' + str(year - 1904) + '_' + str(year) + '.tsv'
    elif year < 1941:
        path = root + '1930srevs/nonconsumptive' + str(year - 1904) + '_' + str(year) + '.tsv'
    else:
        path = root + '1940srevs/nonconsumptive' + str(year - 1904) + '_' + str(year) + '.tsv'
        
    return path

In [7]:
path = get_path(1935)
path

'/Users/tunder/Dropbox/python/avant/1930srevs/nonconsumptive31_1935.tsv'

In [8]:
test_df = pd.read_csv(path, sep = '\t', low_memory = False)
test_df.head()

Unnamed: 0,bookauthor,booktitle,brdpage,price,publisher,publication,citation,quote,avgsentiment,avgsentwmissing,bookindex,numreviewswithsent,numreviewsofbk,authtitlefromindex,matchcloseness
0,"TRACY, DON.",Criss-cross.,996,2.0,247p Vanguard,summary,summary,a a a an an and and and Anna Anna armored as b...,4.0,3.790375,0,4,7,tracy + d + criss-cross,1.83007
1,"TRACY, DON.",Criss-cross.,996,2.0,247p Vanguard,Books,p8 Ja 6 '35 160w,action and authentic Bell crook Don double-cro...,4.0,3.790375,0,4,7,tracy + d + criss-cross,1.83007
2,"TRACY, DON.",Criss-cross.,996,2.0,247p Vanguard,New Statesman & Nation,10:18 Jl 6 '35 120w,a a all Always American and and any are at bad...,4.0,3.790375,0,4,7,tracy + d + criss-cross,1.83007
3,"TRACY, DON.",Criss-cross.,996,2.0,247p Vanguard,N Y Times,p!7 Ja 6 '35 310w,"$10,000 998 a a a action and and and Beckwith ...",4.0,3.790375,0,4,7,tracy + d + criss-cross,1.83007
4,"TRACY, DON.",Criss-cross.,996,2.0,247p Vanguard,Sat R of Lit,11:436 Ja 19 '35 30w,Good tlmeklller,4.0,3.790375,0,4,7,tracy + d + criss-cross,1.83007


In [13]:
match.head()

Unnamed: 0,docid,author,title,latestcomp,in_topic_model,hathi_author,hathi_title,topicmodel_firstpub,birthyear,us_national,...,rownum,is_bestseller,bestseller_year,is_prize_author,is_retroavant,retrospective_firstpub,consensus_earliest_date,author_age,standardized_name,review_comparison
0,mdp.39015059414725,"McKenna, Stephen","Lady Lilith, | a novel: Beong the first part o...",1967.0,False,,,,,,...,895.0,False,,False,False,,1920.0,,"McKenna, Stephen",not_in_tm
1,mdp.39015063548997,"Glyn, Elinor",Visits of Elizabeth | $c: [by] Elinor Glyn.,1943.0,True,"Glyn, Elinor",Visits of Elizabeth,1943.0,1864.0,False,...,,True,1901.0,False,False,,1901.0,37.0,"Glyn, Elinor",here_for_another_reason
2,wu.89099437782,"France, Anatole",The authorized English translations of the nov...,1924.0,False,,,,,,...,,False,,True,False,,1924.0,,"France, Anatole",not_in_tm
3,nyp.33433076075344,"Lynd, Sylvia",The chorus : | a tale of love and folly / | $c...,1952.0,False,,,,,,...,819.0,False,,False,False,,1916.0,,"Lynd, Sylvia",not_in_tm
4,uc1.$b797656,"Rinehart, Mary Roberts","The romantics, | $c: by Mary Roberts Rinehart.",1958.0,True,"Rinehart, Mary Roberts",The romantics,1929.0,1876.0,True,...,,False,,False,False,,1929.0,53.0,"Rinehart, Mary Rober",unreviewed_tm_sample


In [14]:
match.columns

Index(['docid', 'author', 'title', 'latestcomp', 'in_topic_model',
       'hathi_author', 'hathi_title', 'topicmodel_firstpub', 'birthyear',
       'us_national', 'authof3ormore', 'in_BRD', 'brd_review_year', 'rownum',
       'is_bestseller', 'bestseller_year', 'is_prize_author', 'is_retroavant',
       'retrospective_firstpub', 'consensus_earliest_date', 'author_age',
       'standardized_name', 'review_comparison'],
      dtype='object')

In [9]:
from collections import Counter
top_venues = Counter()

### Gather list of top publications

We iterate through possible years and count occurrences of publication titles. We'll keep the top 30.

In [19]:
for year, df in match.groupby('brd_review_year'):
    try:
        year = int(year)
    except:
        print("ERROR:", year)
        continue
    
    path = get_path(year)
    reviews = pd.read_csv(path, sep = '\t', low_memory = False)
    # We use the dictionary "misspellings" to translate publication names
    reviews['publication'] = reviews['publication'].replace(mispellings) # initially comment this out
    venues = Counter(reviews['publication'])
    top_venues.update(venues)

In [20]:
top_venues.most_common(50)

[('N Y Times', 38364),
 ('Sat R of Lit', 30246),
 ('summary', 27990),
 ('Booklist', 26049),
 ('Boston Transcript', 23214),
 ('Books', 20394),
 ('Times [London] Lit Sup', 13995),
 ('New Yorker', 11358),
 ('Nation', 11286),
 ('New Repub', 10494),
 ("Springf'd Republican", 10296),
 ('Spec', 9519),
 ('Wis Lib Bui', 9151),
 ('Library J', 9144),
 ('Outlook', 7805),
 ('San Francisco Chronicle', 6125),
 ('NY Times', 5640),
 ('Weekly Book Review', 5199),
 (nan, 5094),
 ('Lit R', 4902),
 ('Cleveland Open Shelf', 4788),
 ('Time', 4776),
 ('Klrkus', 4604),
 ('Kirkus', 4556),
 ('Bookm', 4359),
 ('Pratt', 4263),
 ('Sat R', 4218),
 ('Commonweal', 3969),
 ('Cath World', 3900),
 ('Christian Science Monitor', 3887),
 ('Book Week', 3882),
 ('Manchester Guardian', 3780),
 ("Sprlngf'd Republican", 3622),
 ('N Y Herald Tribune Wkly Bk R', 3027),
 ('Books (N Y Herald Tribune)', 3016),
 ('Cleveland', 2659),
 ('Ind', 2625),
 ('N Y World', 2556),
 ('Christian Century', 2556),
 ('N Y Evening Post', 2553),
 ('N Y

In [18]:
# create a map of misspellings

venuetitles = [x[0] for x in top_venues.most_common(500)]
corrected = dict()
mispellings = dict()
corrected['N Y Times'] = 'NY Times'
corrected["Springf'd Republican"] = "Sprlngf'd Republican"
corrected['Klrkus'] = 'Kirkus'
for k, v in corrected.items():
	mispellings[v] = k

for i in range(500):
	title_a = venuetitles[i]
	if pd.isnull(title_a):
		continue
	for j in range(i+1, 500):
		title_b = venuetitles[j]
		if pd.isnull(title_b):
			continue
		misspelled = False
		if fuzz.ratio(title_a, title_b) > 85:
			misspelled = True
		elif len(title_a) > 4 and title_a in title_b:
			misspelled = True
		elif title_b in mispellings:
			misspelled = True
		if misspelled:
			if title_a not in corrected and title_b not in corrected:
				mispellings[title_b] = title_a
				corrected[title_a] = title_b
				print(title_b, "is a misspelling of", title_a, top_venues[title_b])
			elif title_a in corrected:
				mispellings[title_b] = title_a
				corrected[title_a] = title_b
				print(title_b, "is a misspelling of", title_a, top_venues[title_b])
			elif title_b in corrected:
				mispellings[title_a] = title_b
				corrected[title_b] = title_a
				print(title_a, "is a misspelling of", title_b, top_venues[title_a])


NY Times is a misspelling of N Y Times 5640
Kirkus is a misspelling of N Y Times 4556
Sprlngf'd Republican is a misspelling of N Y Times 3622
N Y Time* is a misspelling of N Y Times 862
h N Y Times is a misspelling of N Y Times 840
N Y Tlmes is a misspelling of N Y Times 138
(- N Y Times is a misspelling of N Y Times 128
N Y Times plO Jl is a misspelling of N Y Times 88
N Y Timet is a misspelling of N Y Times 84
N Y Times plO S is a misspelling of N Y Times 82
N Y Times plO O is a misspelling of N Y Times 80
N Y Timei is a misspelling of N Y Times 78
N Y Times plO Je is a misspelling of N Y Times 74
N Y Times plO My is a misspelling of N Y Times 66
r- N Y Times is a misspelling of N Y Times 64
N Y Times plO D is a misspelling of N Y Times 64
-NY Times is a misspelling of N Y Times 62
N Y Times plO Mr is a misspelling of N Y Times 56
N Y Times plO Ag is a misspelling of N Y Times 56
N Y Times plO Ap is a misspelling of N Y Times 56
N Y Times plO N is a misspelling of N Y Times 54
f- N Y

In [21]:
dontkeep = {'summary', float('nan')}
venues_kept = [x[0] for x in top_venues.most_common(32) if x[0] not in dontkeep and not pd.isnull(x[0])]
print(len(venues_kept))

30


In [22]:
def get_wordcount(citation, themedian):
    if pd.isnull(citation):
        return themedian
    token = citation.split()[-1].lower()
    if token.endswith('w'):
        token = token.replace('o', '0')
        token = token.replace('l', '1')
        token = token.replace('w', '')
        token = token.replace('s', '5')
        token = token.replace('i', '1')
        token = token.replace('b', '8')
        try:
            count = int(token)
        except:
            count = themedian
        return count
    else:
        return themedian
        

In [25]:
docdict = dict()


for year, df in match.groupby('brd_review_year'):
    try:
        year = int(year)
    except:
        print("ERROR:", year)
        continue
    
    venue_medians = dict()
    
    path = get_path(year)
    reviews = pd.read_csv(path, sep = '\t', low_memory = False)
    reviews['publication'] = reviews['publication'].replace(mispellings)
    allcounts = []
    for idx, row in reviews.iterrows():
        cite = row['citation']
        pub = row['publication']
        wordcount = get_wordcount(cite, 0)
        if wordcount > 0:
            allcounts.append(wordcount)
            if pub in venues_kept:
                if pub not in venue_medians:
                    venue_medians[pub] = []
                venue_medians[pub].append(wordcount)
    themedian = np.median(allcounts)
    print(year, themedian)
    for k, v in venue_medians.items():
        venue_medians[k] = np.median(v)
    
    for idx, row in df.iterrows():
        rownum = int(row['rownum'])
        docid = row['docid']
        reviewrow = reviews.iloc[rownum, :]
        revtitle = reviewrow['booktitle'].lower()
        title = row['title']
        ratio = fuzz.ratio(title, revtitle)
        if ratio < 80:
            print(title, revtitle)
        thebookindex = reviewrow['bookindex']
        allreviews = reviews.loc[reviews.bookindex == thebookindex, :]
        if year < 1931:
            sentiment = reviewrow['avgsentiment']
        else:
            sentiment = reviewrow['avgsentwmissing']
        
        wordcount = 0
        venuedict = dict()
        for v in venues_kept:
            venuedict[v] = 0

        for idx, row in allreviews.iterrows():
            cite = row['citation']
            pub = row['publication']
            if pub in mispellings:
                pub = mispellings[pub]
            words = get_wordcount(cite, themedian)
            if words < 5000:
                wordcount += words
            else:
                wordcount += themedian
            if pub in venues_kept:
                if pub in venue_medians:
                    this_median = venue_medians[pub]
                else:
                    this_median = themedian
                venue_words = get_wordcount(cite, this_median)
                venuedict[pub] += venue_words

        docdict[docid] = (sentiment, wordcount, venuedict)      
    
    

1916 280.0
The chorus : | a tale of love and folly / | $c: by Sylvia Lynd. chorus; a tale of love and folly.
Baldy of Nome; | an immortal of the trail, | $c: by Esther Birdsall Darling ... baldy of nome, il
The golden woman : | the story of a western mining camp / | $c: by Ridgwell Cullum. golden woman.
The Prussian officer, and other stories, | $c: by D.H. Lawrence. prussian of- ficer, and other stories.
Troubled Tranton, | $c: by W. E. Norris. troubled tranton.
Beggars on horseback / | $c: by F. Tennyson Jesse. beggars on horseback, il «*1.25 (h4c) doran
My friend Phil my friend phil, il «j1.25 (2c) kand 15-23061
The chorus chorus; a tale of love and folly.
A baby of the frontier, | $c: by Cyrus Townsend Brady ... baby of the frontier, il
The imprisoned splendor / | $c: by Angela Morgan ... imprisoned splendor.
Blind sight / | $c: by B.Y. Benediall ; frontispiece by George Gibbs. blind sight.
The furnace of iron, | $c: by Andrew Firth ... furnace of iron.
The little lady of the big h

In [26]:
docdict['mdp.39015005776847']

(3.722122498480001,
 21900.0,
 {'N Y Times': 0,
  'Sat R of Lit': 1300,
  'Booklist': 320.0,
  'Boston Transcript': 200,
  'Books': 0,
  'Times [London] Lit Sup': 500,
  'New Yorker': 0,
  'Nation': 0,
  'New Repub': 650,
  "Springf'd Republican": 0,
  'Spec': 900,
  'Wis Lib Bui': 320.0,
  'Library J': 0,
  'Outlook': 500,
  'San Francisco Chronicle': 0,
  'NY Times': 0,
  'Weekly Book Review': 0,
  'Lit R': 0,
  'Cleveland Open Shelf': 320.0,
  'Time': 950,
  'Klrkus': 0,
  'Kirkus': 0,
  'Bookm': 0,
  'Pratt': 200.0,
  'Sat R': 0,
  'Commonweal': 420,
  'Cath World': 0,
  'Christian Science Monitor': 0,
  'Book Week': 0,
  'Manchester Guardian': 500})

In [28]:
match.head()

Unnamed: 0,docid,author,title,latestcomp,in_topic_model,hathi_author,hathi_title,topicmodel_firstpub,birthyear,us_national,...,rownum,is_bestseller,bestseller_year,is_prize_author,is_retroavant,retrospective_firstpub,consensus_earliest_date,author_age,standardized_name,review_comparison
0,mdp.39015059414725,"McKenna, Stephen","Lady Lilith, | a novel: Beong the first part o...",1967.0,False,,,,,,...,895.0,False,,False,False,,1920.0,,"McKenna, Stephen",not_in_tm
1,mdp.39015063548997,"Glyn, Elinor",Visits of Elizabeth | $c: [by] Elinor Glyn.,1943.0,True,"Glyn, Elinor",Visits of Elizabeth,1943.0,1864.0,False,...,,True,1901.0,False,False,,1901.0,37.0,"Glyn, Elinor",here_for_another_reason
2,wu.89099437782,"France, Anatole",The authorized English translations of the nov...,1924.0,False,,,,,,...,,False,,True,False,,1924.0,,"France, Anatole",not_in_tm
3,nyp.33433076075344,"Lynd, Sylvia",The chorus : | a tale of love and folly / | $c...,1952.0,False,,,,,,...,819.0,False,,False,False,,1916.0,,"Lynd, Sylvia",not_in_tm
4,uc1.$b797656,"Rinehart, Mary Roberts","The romantics, | $c: by Mary Roberts Rinehart.",1958.0,True,"Rinehart, Mary Roberts",The romantics,1929.0,1876.0,True,...,,False,,False,False,,1929.0,53.0,"Rinehart, Mary Rober",unreviewed_tm_sample


In [29]:
newsentcol = []
newwordcol = []

venuecolumns = dict()
for v in venues_kept:
    venuecolumns[v] = []

for doc in match.docid:
    if doc in docdict:
        sentiment, wordcount, vdict = docdict[doc]
        for v in venues_kept:
            venuecolumns[v].append(vdict[v])
    else:
        sentiment = np.nan
        wordcount = 0
        for v in venues_kept:
            venuecolumns[v].append(0)
    
    newsentcol.append(sentiment)
    newwordcol.append(wordcount)
        

In [30]:
match.shape

(10339, 23)

In [31]:
outmatch = match.copy()

In [32]:
outmatch['review_wordcount'] = newwordcol

In [33]:
outmatch['review_sentiment'] = newsentcol

In [34]:
for v in venues_kept:
	outmatch[v] = venuecolumns[v]

In [35]:
outmatch.shape

(10339, 55)

In [36]:
outmatch.to_csv('richer_reception.tsv', sep = '\t', index = False)

In [37]:
outmatch.columns

Index(['docid', 'author', 'title', 'latestcomp', 'in_topic_model',
       'hathi_author', 'hathi_title', 'topicmodel_firstpub', 'birthyear',
       'us_national', 'authof3ormore', 'in_BRD', 'brd_review_year', 'rownum',
       'is_bestseller', 'bestseller_year', 'is_prize_author', 'is_retroavant',
       'retrospective_firstpub', 'consensus_earliest_date', 'author_age',
       'standardized_name', 'review_comparison', 'review_wordcount',
       'review_sentiment', 'N Y Times', 'Sat R of Lit', 'Booklist',
       'Boston Transcript', 'Books', 'Times [London] Lit Sup', 'New Yorker',
       'Nation', 'New Repub', 'Springf'd Republican', 'Spec', 'Wis Lib Bui',
       'Library J', 'Outlook', 'San Francisco Chronicle', 'NY Times',
       'Weekly Book Review', 'Lit R', 'Cleveland Open Shelf', 'Time', 'Klrkus',
       'Kirkus', 'Bookm', 'Pratt', 'Sat R', 'Commonweal', 'Cath World',
       'Christian Science Monitor', 'Book Week', 'Manchester Guardian'],
      dtype='object')