# Clean up the book ranking

The raw ranking contains many entries that are not really book titles but common phrases (e.g. 'The Book'). These entries must be removed.

In [134]:
import re
import unidecode
from pathlib import Path

import pandas as pd

# Input: raw per-year title counts, loaded with pd.read_csv further down.
BOOK_COUNT_DF = Path('./data/book_counts_raw.csv')
# Output: destination of the filtered ranking written at the end of the notebook.
BOOK_COUNT_CLEANED_DF = Path('./data/book_counts_cleaned.csv')
# Text file of hand-picked titles to exclude; it is read line by line, one title per line.
HAND_PICKED_AMBIVALENT_TITLES = Path('./data/faux_book_titles')
# Runs of non-word characters and/or underscores; normalize_text() deletes every match.
NON_ALPHANUMERIC = re.compile(r'[\W_]+')

def normalize_text(title: str) -> str:
    """
    Builds the comparison key for a book title and returns it.

    The title is passed through unidecode, lower-cased, and every run of
    non-word characters or underscores (this includes whitespace) is removed.
    If the title starts with the article 'The ', 'An ' or 'A ' (case-insensitive,
    leading whitespace ignored), the article is dropped from the key as well,
    e.g. 'The Hunger Games' -> 'hungergames'.

    The returned key may be empty if the title contains nothing alphanumeric
    besides a leading article.
    """
    text = unidecode.unidecode(title).lower()
    # Whitespace is matched by the pattern too, so no strip() is needed afterwards.
    text = NON_ALPHANUMERIC.sub('', text)

    # The space that separates the article from the rest is already gone from
    # `text`, so the article is detected on the raw title. Leading whitespace is
    # skipped first; otherwise ' The Book' would keep its article ('thebook')
    # while 'The Book' would not ('book').
    head = title.lstrip()
    for article in ('the ', 'an ', 'a '):
        if head[:len(article)].lower() == article:
            # Drop the article's letters (its trailing space was removed above).
            return text[len(article) - 1:]

    return text

In [135]:
# Load the raw counts and remove double quotes from both ends of every title.
book_ranking = pd.read_csv(BOOK_COUNT_DF, index_col=0).assign(
    Title=lambda frame: frame['Title'].astype(str).str.strip('"')
)
book_ranking

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
0,2021,The Heir,heir,post,4080
1,2020,The Heir,heir,post,3236
2,2022,The Heir,heir,post,3153
3,2022,The Heir,heir,comment,3130
4,2021,IONI,ioni,post,2615
...,...,...,...,...,...
89779,2021,Skeletons on the Zahara,skeletonsonthezahara,post,1
89780,2021,White Gold,whitegold,post,1
89781,2021,The church mouse,churchmouse,post,1
89782,2021,Vampire's kiss,vampireskiss,post,1


In [136]:
# Hand-picked ambivalent titles, normalized the same way as the
# 'Normalized Title' column so they can be matched against it:
faux_titles = set(
    map(normalize_text, HAND_PICKED_AMBIVALENT_TITLES.read_text().splitlines())
)
print(f'Collected {len(faux_titles):,} titles that have an ambivalent meaning and cannot be distinguished from normal phrases or sentences. They are removed from the ranking.')

Collected 852 titles that have an ambivalent meaning and cannot be distinguished from normal phrases or sentences. They are removed from the ranking.


In [137]:
# Drop every row whose normalized title is one of the faux titles, then order
# the remaining rows by occurrence count, highest first, with a fresh index.
filtered_book_ranking = (
    book_ranking
    .loc[lambda frame: ~frame['Normalized Title'].isin(faux_titles)]
    .sort_values('Occurrences', ascending=False, ignore_index=True)
)
# Shift the 0-based index so that it starts at 1.
filtered_book_ranking.index += 1

filtered_book_ranking

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
1,2012,The Hunger Games,hungergames,post,326
2,2021,Dune,dune,post,311
3,2020,1984,1984,post,310
4,2021,1984,1984,post,297
5,2020,Dune,dune,post,286
...,...,...,...,...,...
83037,2011,A Giraffe and a Half,giraffeandahalf,post,1
83038,2012,Childhood's End,childhoodsend,post,1
83039,2015,Tales of Nevèrÿon,talesofneveryon,post,1
83040,2012,Fifth Avenue,fifthavenue,post,1


In [138]:
# Absolute ranking: one row per (Title, Normalized Title) pair, summed over all
# of its rows and ordered by total occurrences, highest first.
# NOTE(review): .sum() aggregates every non-key column, not just 'Occurrences';
# only 'Occurrences' is displayed below - confirm the other sums are not relied on.
absolute_rank = (
    filtered_book_ranking
    .groupby(['Title', 'Normalized Title'])
    .sum()
    .sort_values('Occurrences', ascending=False)
)
absolute_rank[['Occurrences']].reset_index()

Unnamed: 0,Title,Normalized Title,Occurrences
0,Dune,dune,2306
1,1984,1984,2230
2,The Hunger Games,hungergames,1174
3,Infinite jest,infinitejest,1040
4,The Hobbit,hobbit,999
...,...,...,...
21293,The 22 immutable laws of branding,22immutablelawsofbranding,1
21294,The 22 Murders of Madison May,22murdersofmadisonmay,1
21295,The 2-hour job search,2hourjobsearch,1
21296,The 13th valley,13thvalley,1


In [139]:
# Persist the filtered ranking; the index is written as the first CSV column.
filtered_book_ranking.to_csv(BOOK_COUNT_CLEANED_DF)