In [1]:
import pandas as pd
import re
from tqdm import tqdm 

In [2]:
from fuzzywuzzy import fuzz
from difflib import SequenceMatcher

# Prize winners in HathiTrust titlemeta.tsv

In [3]:
"""
let's open our data
+
keep only the columns with which we gonna work
"""

titlemeta = pd.read_csv("titlemeta.tsv", sep="\t", low_memory=False)
titlemeta_small = titlemeta[['author', 'title', 'docid']].drop_duplicates(['author', 'title']).reset_index(drop=True)

prizes = pd.read_csv("prizes.tsv", sep="\t", low_memory=False)

In [4]:
"""
keep only the columns with which we gonna work
"""

titlemeta_small.shape

(123371, 3)

In [5]:
"""
clean the titles from everything what follows the mojibake '| $c:' 
+
let's crerate a new column with clean titles
"""

clean_titles = []
for index, row in titlemeta_small.iterrows():
    if re.findall(" \| \$c:", row["title"]):
        slice_index = row["title"].rfind("| $c:")
        clean_title = re.sub("[\/\|\[\]]", '', row["title"][:slice_index])
        clean_titles.append(clean_title)
    else:
        clean_title = re.sub("[\/\|\[\]]", '', row["title"][:slice_index])
        clean_titles.append(clean_title)

        
titlemeta_small["clean_title"] = clean_titles
        

In [6]:
titlemeta_small

Unnamed: 0,author,title,docid,clean_title
0,"Spencer, Louise Reid",Guerrilla wife | $c: [by] Louise Reid Spencer.,mdp.39015031913893,Guerrilla wife
1,"Baker, Robert H",The suburbs : | a novel / | $c: by Robert H. B...,mdp.39015003936864,The suburbs : a novel
2,"Dickens, Charles",Edwin Drood. | $c: By Charles Dickens. With il...,mdp.39015068342305,Edwin Drood.
3,"Stretton, Hesba","Carola, | $c: by Hesba Stretton.",mdp.39015055066586,"Carola,"
4,"Stretton, Hesba",In prison & out. | $c: By Hesba Stretton.,mdp.39015055066594,In prison & out.
...,...,...,...,...
123366,,Six plays of the yiddish theatre / | $c: [tran...,umn.31951d030369342,Six plays of the yiddish theatre
123367,"Wison, Steven E. (Steven Eugene)",The ghosts of Anatolia : | an epic journey to ...,mdp.39076002906829,The ghosts of Anatolia : an epic journey to f...
123368,"Jeon, Heecheon",Subjectivity of différance : | a poiesis of de...,inu.30000127716409,Subjectivity of différance : a poiesis of dec...
123369,"Wells, H. G. (Herbert George)",Certain personal matters / | $c: by H. G. Wells.,ien.35556042103887,Certain personal matters


In [7]:
"""
let's do the fuzzy matching for the HT dataframe and another df
+
create a new dataframe with the intersections
NB: the code is partly borrowed from Wenyi's notebook
"""

def dataframes_match(titlemeta, df_to_compare, df_author, df_title, df_date,
                    n_ratio_author, n_ratio_title):

    titlemeta_ID = []
    titlemeta_author = []
    titlemeta_title = []
    prize_author = []
    prize_title = []
    prize_date = []
    ratio_author_lst = []
    ratio_title_lst = []

    """
    perhaps, it's better to create another algorythm, loop in loop works very slowly -> maybe lambda can 
    work better
    """

    for idx, row in tqdm(titlemeta.iterrows()):

        author = str(row["author"]).lower()
        title = str(row["clean_title"]).lower()

        if title.startswith('the '):        
            title = title[4:]               


        for idx2, row2 in df_to_compare.iterrows():
            author2 = str(row2["author"]).lower()
            title2 = str(row2["title"]).lower()

            if title2.startswith('the '):
                title2 = title2[4:]

            ratio_author = fuzz.ratio(author2, author)
            ratio_title = fuzz.ratio(title2, title)

            if ratio_author > n_ratio_author and ratio_title > n_ratio_title:


                titlemeta_ID.append(row["docid"])
                titlemeta_author.append(row["author"])
                titlemeta_title.append(row["clean_title"])
                prize_author.append(row2["author"])
                prize_title.append(row2["title"])
                prize_date.append(row2["prizedate"])
                ratio_author_lst.append(ratio_author)
                ratio_title_lst.append(ratio_title)

            else:
                continue

    final_df = pd.DataFrame(list(zip(titlemeta_ID, titlemeta_author, titlemeta_title, prize_author,
                                      prize_title, prize_date, ratio_author_lst, ratio_title_lst)), 
                            columns=["titlemeta_ID", "titlemeta_author", "titlemeta_title", 
                                    df_author, df_title, df_date, "ratio_author", "ratio_title"])
    return final_df

In [8]:
hathi_prize = dataframes_match(titlemeta_small, prizes, "prize_author", "prize_title", "prize_date",
                               n_ratio_author = 75, n_ratio_title = 0)

123371it [12:43, 161.68it/s]


In [9]:
"""
let's save our data to csv file
"""

hathi_prize.to_csv('list_of_matches_HT_prize_75_0.csv', sep = '\t', index = False)

In [10]:
"""
let's filter two types of authors: those, who got the Prize for the whole work and those, who got the
Prize for a particular work. The ratio of the title is important for us. 
Perhaphs, the ratio_title should be lower than 50, I set it just according to my very subjective opinion
"""
any_book = hathi_prize[hathi_prize['prize_title'] == '#matchall'].reset_index(drop=True)
specific_books = hathi_prize[hathi_prize['prize_title'] != '#matchall'].reset_index(drop=True)
specific_books = specific_books[specific_books['ratio_title'] > 50].reset_index(drop=True)

In [11]:
any_book

Unnamed: 0,titlemeta_ID,titlemeta_author,titlemeta_title,prize_author,prize_title,prize_date,ratio_author,ratio_title
0,wu.89099437782,"France, Anatole",The authorized English translations of the nov...,"France, Anatole",#matchall,1921,100,13
1,nyp.33433074955463,"Mozeen, Thomas",Young Scarron,"Mann, Thomas",#matchall,1929,77,17
2,mdp.39015010783762,"Hawkesworth, John",Almoran and Hamet: an Oriental tale. In two v...,"Galsworthy, John",#matchall,1932,79,19
3,uc2.ark+=13960=t7vm44d7m,"Hawkesworth, John",Almoran and Hamet,"Galsworthy, John",#matchall,1932,79,31
4,mdp.39015063891314,"Leland, Thomas","Longsword, Earl of Salisbury. An historical r...","Mann, Thomas",#matchall,1929,77,16
...,...,...,...,...,...,...,...,...
552,mdp.39015069362567,"Mallon, Thomas",Fellow travelers,"Mann, Thomas",#matchall,1929,85,22
553,mdp.39015079234749,"Maltman, Thomas",The night birds,"Mann, Thomas",#matchall,1929,81,9
554,inu.30000095637116,"Lewis, Sinclair",The short stories of Sinclair Lewis (1904-1949),"Lewis, Sinclair",#matchall,1930,100,15
555,mdp.39015080827390,"Emson, Thomas",Skarlet,"Mann, Thomas",#matchall,1929,80,22


In [12]:
"""
let's recheck the intersections between titles and get rid off books which are not in the prize_title column
"""

for index, row in specific_books.iterrows():
    if str(row["prize_title"]).lower() not in str(row["titlemeta_title"]).lower():
        specific_books.drop(index, inplace=True)

In [13]:
"""
let's concat two our dataframes and save them as an independenct .csv file
"""
prize_winners = pd.concat([any_book, specific_books]).reset_index(drop=True)
prize_winners.to_csv('match_prize_winners_75.csv', index = False)

# Bestsellers in HathiTrust titlemeta.tsv

In [14]:
bestsellers = pd.read_csv("new_unsworth_bestsellers.csv", low_memory=False)

In [15]:
bestsellers

Unnamed: 0,year,author,title
0,1900,"Johnston, Mary",To Have and To Hold
1,1900,"Cholmondeley, Mary",Red Pottage
2,1900,"Grant, Robert",Unleavened Bread
3,1900,"Allen, James Lane",The Reign of Law
4,1900,"Bacheller, Irving",Eben Holden
...,...,...,...
1116,1999,"Fitch, Janet",White Oleander
1117,1999,"Sparks, Nicholas",A Walk to Remember
1118,1999,"Patterson, James",Pop Goes the Weasel
1119,1999,"Cornwell, Patricia",Black Notice


In [16]:
bestsellers.rename(columns = {'year':'prizedate'}, inplace = True)

In [17]:
hathi_bestsellers = dataframes_match(titlemeta_small, bestsellers, "bestseller_author", "bestseller_title",
                               "bestseller_year", n_ratio_author = 75, n_ratio_title = 50)

123371it [4:22:59,  7.82it/s]


In [18]:
hathi_bestsellers.to_csv('list_of_matches_HT_bestsellers_75_50.csv', index = False)

In [19]:
hathi_bestsellers[hathi_bestsellers['ratio_title'] > 55].reset_index(drop=True)
# specific_books[specific_books['ratio_title'] > 50].reset_index(drop=True)

Unnamed: 0,titlemeta_ID,titlemeta_author,titlemeta_title,bestseller_author,bestseller_title,bestseller_year,ratio_author,ratio_title
0,mdp.39015063548997,"Glyn, Elinor",Visits of Elizabeth,"Glyn, Elinor",The Visits of Elizabeth,1901,100,97
1,loc.ark+=13960=t3st7r64d,"Greene, Asa",Travels in,"Greene, Graham",Travels with My Aunt,1970,80,65
2,uc2.ark+=13960=t14m9612t,"Stewart, E. M",Lillias Davenant,"Stewart, Mary",The Last Enchantment,1979,77,56
3,uc1.b3962029,"Churchill, Winston, Sir","Richard Carvel,","Churchill, Winston",Richard Carvel,1900,88,93
4,nyp.33433082272703,"Cholmondeley, Mary","Red pottage,","Cholmondeley, Mary",Red Pottage,1900,100,92
...,...,...,...,...,...,...,...,...
876,inu.30000100635840,"Beauvoir, Simone de",The mandarins,"de Beauvoir, Simone",The Mandarins,1956,84,90
877,pst.000065617298,"Grey, Zane",To the last man,"Grey, Zane",To the Last Man,1922,100,94
878,mdp.39015064953329,"Bachman, Richard",Blaze : a novel,"Bach, Richard",One: A Novel,1988,90,67
879,mdp.39015070756450,"Doctorow, E. L",Loon Lake : a novel,"Doctorow, E.L.",Loon Lake,1980,93,58
