# Counting references to bestsellers



In [24]:
# degrees_of_bestseller.py

import pandas as pd
import numpy as np
import csv

from collections import Counter

pseudonyms = dict()
with open('pseudonyms.csv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        pseudonyms[row['pseudonym']] = row['ourname']

prominence = Counter()
namestocheck = dict()

def nameinitial(astring):
    if ',' in astring:
        parts = astring.split(',')
        name = parts[0]
        if len(parts[0]) > 0:
            parts[1] = parts[1].strip()
            initial = parts[1][0]
        else:
            initial = ''
    else:
        name = astring
        initial = ''

    return name, initial

def normalize_author(author):
    global pseudonyms
    author = author.strip(', .')
    if author.startswith('Bront') and author.endswith('lotte'):
        author = "Brontë, Charlotte"
        # because bad things happen to umlauts sometimes
    if author in pseudonyms:
        author = pseudonyms[author]
        
    return author

authorset = set()
nationalities = dict()
uslists = Counter()
uklist = Counter()
reviews = Counter()

existingbest = pd.read_csv('/Users/tunder/Dropbox/GenreProject/python/reception/sales/bestsellermetadata.csv')
for i in existingbest.index:
    auth = normalize_author(existingbest.loc[i, 'author'])
    if auth in pseudonyms:
        auth = pseudonyms[auth]
    sales = existingbest.loc[i, 'sales']
    date = int(existingbest.loc[i, 'earliestdate'])
    # if date < 1901 and sales == 'best':
    #     uslists[auth] += 1
    name, initial = nameinitial(auth)
    namestocheck[(name, initial)] = auth
    authorset.add(auth)
    nationality = existingbest.loc[i, 'nationality']
    nationalities[auth] = nationality

with open('bestsellersources/HackettEBS.csv', encoding = 'utf-8') as f:
    recordids = set()
    reader = csv.DictReader(f)
    for row in reader:
        if row['recordid'] in recordids:
            continue
        else:
            auth = row['author'].strip('., ')
            date = int(row['date'])
            if date > 1849:
                uklist [auth] += 1
                authorset.add(auth)

with open('bestsellersources/QDLeavisOutline.csv', encoding = 'utf-8') as f:
    recordids = set()
    reader = csv.DictReader(f)
    for row in reader:
        if row['recordid'] in recordids:
            continue
        else:
            auth = normalize_author(row['author'])
            date = int(row['date'])
            if date > 1849:
                uklist[auth] += 1
                authorset.add(auth)

with open('bestsellersources/AltickECReaderFicPoe.csv', encoding = 'utf-8') as f:
    recordids = set()
    reader = csv.DictReader(f)
    for row in reader:
        if row['recordid'] in recordids:
            continue
        elif row['Jgenre'] != 'fic':
            continue
        else:
            auth = normalize_author(row['author'])
            date = int(row['firstpub'])
            if date > 1849:
                uklist[auth] += 1
                authorset.add(auth)

with open('bestsellersources/MottGoldenMultitudes.csv', encoding = 'utf-8') as f:
    recordids = set()
    reader = csv.DictReader(f)
    for row in reader:
        auth = normalize_author(row['author'])
        date = int(row['date'])
        if date > 1849:
            uslists[auth] += 1
            authorset.add(auth)

rows = []
for year in range (1895, 1901):
    path = 'bestsellersources/Hackett' + str(year) + '.csv'
    with open(path, encoding = 'utf-8') as f:
        reader = csv.DictReader(f)
        fields = reader.fieldnames
        lastauth = 'mmmmm'
        lasttitle = 'mmmmm'
        for row in reader:
            author = normalize_author(row['author'])
            if len(author) < 3:
                continue
            if author.startswith(lastauth):
                continue
            title = row['title'].strip(', .')
            if title.startswith(lasttitle):
                continue
            if len(author) > 4:
                lastauth = author[0:4]
            if len(title) > 4:
                lasttitle = title[0:4]
            if year < 1901:
                uslists[author] += 1
                authorset.add(auth)
                rows.append(row)

mapping = dict()

with open('/Users/tunder/Dropbox/raship/ted/code/new_unsworth_bestsellers.csv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        year = int(row['year'])
        if year > 1949:
            continue
        auth = normalize_author(row['author'])
        if auth in authorset:
            uslists[auth] += 1
        elif auth in mapping:
            auth = mapping[auth]
            uslists[auth] += 1
            authorset.add(auth)
        else:
            name, initial = nameinitial(auth)
            if (name, initial) in namestocheck:

                if auth == namestocheck[(name, initial)]:
                    uslists[auth] += 1
                else:
                    # print(auth)
                    # print(name, initial)
                    # print(namestocheck[(name, initial)])
                    user = 'y'
                    if user == 'y':
                        uslists[namestocheck[(name, initial)]] += 1
                        mapping[auth] = namestocheck[(name, initial)]
                        authorset.add(auth)
                    else:
                        uslists[auth] += 1
                        authorset.add(auth)
            else:
                uslists[auth] += 1
                authorset.add(auth)

prestigemeta = pd.read_csv('../fiction/prestigeficmeta.csv')
for i in prestigemeta.index:
    reviewed = prestigemeta.loc[i, 'tags']
    auth = normalize_author(prestigemeta.loc[i, 'author'])
    authorset.add(auth)
    nationality = prestigemeta.loc[i, 'nationality']
    if auth not in nationalities:
        nationalities[auth] = nationality
    else:
        if nationality != nationalities[auth] and not pd.isnull(nationality):
            print('error', auth, nationality, nationalities[auth])
            # but treat prestigemeta as the final authority
            nationalities[auth] = nationality
    if reviewed == 'elite':
        reviews[auth] += 1
        # alreadycounted.add(auth)

with open('../bayespost/clivebloombestsellers.csv', encoding = 'utf-8') as f:
    reader = csv.DictReader(f)
    for row in reader:
        auth = normalize_author(row['author'])
        uklist[auth] += 2
        # Note that this source gets +2, while all the others get +1.
        # That's because Bloom is usually mentioning more than one book
        # for each author, and never mentions authors more than once.
        # It's more of a "career award."
        authorset.add(auth)

isuk = dict()
isus = dict()
isother = dict()

for k, v in nationalities.items():
    if v == 'us' or v == 'ca':
        isus[k] = 1
    elif v == 'uk' or v == 'ir':
        isuk[k] = 1
    else:
        isother[k] = 1

prepframe = dict()

prepframe['uslists'] = uslists
prepframe['reviews'] = reviews
prepframe['uklist'] = uklist
prepframe['nationality'] = nationalities
prepframe['is_us'] = isus
prepframe['is_uk'] = isuk
prepframe['is_other'] = isother

authorset.remove('<blank>')
df = pd.DataFrame(prepframe, index = list(authorset))
df = df.fillna(0)
df['salesevidence'] = df.uslists + df.uklist
df.to_csv('../bayespost/counted_bestsellers.csv', index_label = 'author')

error Foxton, E us nan
error Burnett, Frances Hodgson us uk
error Du Maurier, George uk nan
error Thurston, Katherine ir uk
error Joyce, James ir uk
error Blasco Ibáñez, Vicente es spanish
error Mundy, Talbot us uk
error Arlen, Michael uk us
error Fallada, Hans ger german


In [25]:
len(df)

1177