# Clean up the book ranking

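The raw ranking contains many entries that are not genuine book mentions but common phrases that happen to match a book title (e.g. 'The Book'). These entries must be removed; this is done below with a hand-picked list of such ambiguous titles.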

In [1]:
import re
import unidecode
from pathlib import Path

import pandas as pd

# Input: raw occurrence counts per year and book title (loaded with pd.read_csv further down).
BOOK_COUNT_DF = Path('./data/book_counts_raw.csv')
# Output: the ranking after the faux titles have been dropped (written with DataFrame.to_csv at the end).
BOOK_COUNT_CLEANED_DF = Path('./data/book_counts_cleaned.csv')
# Plain-text list of hand-picked faux titles; it is read line by line, one title per line.
HAND_PICKED_AMBIVALENT_TITLES = Path('./data/faux_book_titles')
# Matches every run of non-word characters (whitespace, punctuation, ...) and underscores.
NON_ALPHANUMERIC = re.compile(r'[\W_]+')

def normalize_text(title: str) -> str:
    """
    Builds the comparison key of a book title and returns it.

    The title is transliterated to ASCII, lowercased and reduced to its letters and digits
    (whitespace, punctuation and underscores are dropped). If the original title starts with
    one of the articles 'The ', 'An ' or 'A ' (in any casing, followed by a space), that
    article is cut off the front of the key, e.g. 'The Heir' -> 'heir'.
    """
    # NON_ALPHANUMERIC also matches whitespace, so nothing is left to trim afterwards.
    key = NON_ALPHANUMERIC.sub('', unidecode.unidecode(title).lower())

    # The article is detected on the raw title because the space that separates it from the
    # rest of the title no longer exists in the key. Inside the key the article has lost that
    # trailing space, hence the `- 1` when cutting it off.
    for article in ('the ', 'an ', 'a '):
        if title[:len(article)].lower() == article:
            return key[len(article) - 1:]

    return key

In [2]:
# Load the raw ranking (the first CSV column is the row index) and make sure every title is a
# string without surrounding double quotes.
book_ranking = (
    pd.read_csv(BOOK_COUNT_DF, index_col=0)
    .assign(Title=lambda raw: raw['Title'].astype(str).str.strip('"'))
)
book_ranking

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
0,2021,The Heir,heir,post,4084
1,2020,The Heir,heir,post,3237
2,2022,The Heir,heir,post,3153
3,2022,The Heir,heir,comment,3151
4,2018,The Heir,heir,post,2221
...,...,...,...,...,...
83564,2021,All the President's Men,allthepresidentsmen,post,1
83565,2021,Show-Stopper!,showstopper,post,1
83566,2021,The Angel of the Crows,angelofthecrows,post,1
83567,2021,The Witness for the Dead,witnessforthedead,post,1


In [3]:
# Ambiguous book titles that should be removed:
# The list holds one hand-picked title per line. It is decoded explicitly instead of relying on
# the platform's locale encoding, so titles with non-ASCII characters are read the same way on
# every machine. NOTE(review): assumes the list is saved as UTF-8 - confirm.
faux_title_lines = HAND_PICKED_AMBIVALENT_TITLES.read_text(encoding='utf-8').splitlines(keepends=False)
faux_titles = {normalize_text(title) for title in faux_title_lines}
# Blank lines normalize to the empty string, which is not a title: keep it out of the set so it
# does not inflate the count reported below.
faux_titles.discard('')
print(f'Collected {len(faux_titles):,} titles that have an ambivalent meaning and cannot be distinguished from normal phrases or sentences. They are removed from the ranking.')

Collected 855 titles that have an ambivalent meaning and cannot be distinguished from normal phrases or sentences. They are removed from the ranking.


In [4]:
# Drop every row whose normalized title is on the faux-title list, then order the remaining rows
# by their number of occurrences, highest first.
is_faux_title = book_ranking['Normalized Title'].isin(faux_titles)
filtered_book_ranking = book_ranking.loc[~is_faux_title].sort_values(
    by='Occurrences', ascending=False, ignore_index=True
)
# Number the rows from 1 instead of 0 so that the index reads as a rank.
filtered_book_ranking.index = filtered_book_ranking.index + 1

filtered_book_ranking

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
1,2012,The Hunger Games,hungergames,post,326
2,2020,1984,1984,post,310
3,2021,Dune,dune,post,310
4,2021,1984,1984,post,297
5,2020,Dune,dune,post,286
...,...,...,...,...,...
83300,2011,Anábasis,anabasis,post,1
83301,2011,The oresteia,oresteia,post,1
83302,2011,Existentialism is a humanism,existentialismisahumanism,post,1
83303,2015,Here comes everybody,herecomeseverybody,post,1


In [5]:
# Absolute ranking: total occurrences per title, summed over all years and over posts and comments.
# `numeric_only=True` is passed explicitly: relying on the default is deprecated in pandas 1.5 and,
# from pandas 2.0 on, the default would also "sum" the text column 'Post or Comment'.
# The resulting frame keeps the same columns as before; its summed 'Year' column carries no
# meaning, so only 'Occurrences' is displayed.
absolute_rank = (
    filtered_book_ranking
    .groupby(['Title', 'Normalized Title'])
    .sum(numeric_only=True)
    .sort_values('Occurrences', ascending=False)
)
absolute_rank[['Occurrences']].reset_index()

  absolute_rank = pd.DataFrame(filtered_book_ranking.groupby(['Title', 'Normalized Title']).sum()).sort_values('Occurrences', ascending=False)


Unnamed: 0,Title,Normalized Title,Occurrences
0,Dune,dune,2306
1,1984,1984,2229
2,The Hunger Games,hungergames,1176
3,Infinite jest,infinitejest,1045
4,The Hobbit,hobbit,999
...,...,...,...
21333,Chasing Fire,chasingfire,1
21334,Silent prey,silentprey,1
21335,Silent messages,silentmessages,1
21336,Silent killer,silentkiller,1


In [6]:
# Persist the cleaned ranking; its 1-based rank index is written as the first CSV column.
filtered_book_ranking.to_csv(path_or_buf=BOOK_COUNT_CLEANED_DF)