# Clean up the book ranking

The raw ranking contains many entries that are not really book titles but common phrases (e.g. 'The Book'). These entries must be removed.

In [1]:
import re
import unidecode
from pathlib import Path

import pandas as pd

# Input CSV with the raw title counts; loaded with its first column as the index.
BOOK_COUNT_DF = Path('./data/book_counts_raw.csv')
# Output CSV that receives the ranking once the faux titles have been filtered out.
BOOK_COUNT_CLEANED_DF = Path('./data/book_counts_cleaned.csv')
# UTF-8 text file listing one hand-picked faux title per line.
HAND_PICKED_AMBIVALENT_TITLES = Path('./data/faux_book_titles')
# Matches runs of characters that are neither letters nor digits (`\W` plus the underscore).
NON_ALPHANUMERIC = re.compile(r'[\W_]+')

def normalize_text(title: str) -> str:
    """
    Normalizes a title so that differently written variants compare equal.

    The title is transliterated to ASCII, lowercased and stripped of every
    character that is not a letter or a digit (whitespace, punctuation and
    underscores). A leading English article ('The ', 'An ' or 'A ') is dropped
    as well, e.g. 'The Hunger Games' -> 'hungergames'.

    :param title: The title as written in the source data.
    :return: The normalized title; may be empty if nothing but an article or
        special characters was passed in.
    """
    text = unidecode.unidecode(title).lower()
    text = NON_ALPHANUMERIC.sub('', text)

    # The article is detected on the raw title rather than on the transliterated
    # text, so that accented look-alikes (e.g. a leading 'À ') are not mistaken
    # for an English article. Leading whitespace is ignored because the
    # normalization above discards it anyway: ' The Book' and 'The Book' must
    # therefore produce the same result.
    head = title.lstrip().lower()
    for article in ('the', 'an', 'a'):
        # The trailing space makes sure only a whole word counts as an article
        # ('Theory' or 'Anna' stay untouched). The check on `text` guarantees
        # that the slice really removes the article and nothing else.
        if head.startswith(article + ' ') and text.startswith(article):
            return text[len(article):]

    return text

In [2]:
# Load the raw counts (the first CSV column becomes the row index) and strip
# leading/trailing double quotes from the titles.
book_ranking = (
    pd.read_csv(BOOK_COUNT_DF, index_col=0)
    .assign(Title=lambda frame: frame['Title'].astype(str).str.strip('"'))
)
book_ranking

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
0,2020,Dread,dread,post,6361
1,2021,Dread,dread,post,5834
2,2018,Dread,dread,post,3411
3,2022,Dread,dread,post,3235
4,2022,Dread,dread,comment,3224
...,...,...,...,...,...
101553,2021,Small Victories,smallvictories,post,1
101554,2021,Dark days,darkdays,post,1
101555,2021,Africa's Tarnished Name,africastarnishedname,post,1
101556,2021,Perfect strangers,perfectstrangers,post,1


In [3]:
# Hand-picked ambiguous titles, one per line. They are normalized so that they
# can be compared against the 'Normalized Title' column of the ranking.
faux_title_lines = HAND_PICKED_AMBIVALENT_TITLES.read_text(encoding='utf-8').splitlines()
faux_titles = set(map(normalize_text, faux_title_lines))
print(f'Collected {len(faux_titles):,} titles that have an ambivalent meaning and cannot be distinguished from normal phrases or sentences. They are removed from the ranking.')

Collected 1,043 titles that have an ambivalent meaning and cannot be distinguished from normal phrases or sentences. They are removed from the ranking.


In [4]:
# Rows whose normalized title is one of the hand-picked faux titles are dropped.
is_faux_title = book_ranking['Normalized Title'].isin(faux_titles)
filtered_book_ranking = pd.DataFrame(
    book_ranking.loc[~is_faux_title].sort_values('Occurrences', ascending=False, ignore_index=True)
)
# Shift the fresh 0-based index by one so that it reads as a 1-based rank.
filtered_book_ranking.index += 1

filtered_book_ranking

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
1,2016,Harry Potter,harrypotter,post,390
2,2012,The Hunger Games,hungergames,post,326
3,2020,1984,1984,post,310
4,2015,Harry Potter,harrypotter,post,310
5,2021,1984,1984,post,297
...,...,...,...,...,...
98801,2011,The origin of Satan,originofsatan,post,1
98802,2011,Smilla's sense of snow,smillassenseofsnow,post,1
98803,2011,Little Women and me,littlewomenandme,post,1
98804,2011,The honest truth,honesttruth,post,1


In [5]:
# Absolute ranking:
absolute_rank = pd.DataFrame(filtered_book_ranking.groupby(['Title', 'Normalized Title']).sum()).sort_values('Occurrences', ascending=False)
absolute_rank[['Occurrences']].reset_index()

  absolute_rank = pd.DataFrame(filtered_book_ranking.groupby(['Title', 'Normalized Title']).sum()).sort_values('Occurrences', ascending=False)


Unnamed: 0,Title,Normalized Title,Occurrences
0,Harry Potter,harrypotter,3151
1,1984,1984,2232
2,The Lord of the Rings,lordoftherings,1225
3,The Hunger Games,hungergames,1176
4,Bible,bible,1067
...,...,...,...
23482,Black holes and time warps,blackholesandtimewarps,1
23483,Black heather,blackheather,1
23484,Scar night,scarnight,1
23485,Scared Stiff,scaredstiff,1


In [6]:
# Persist the cleaned ranking; the 1-based rank index is written as the first column.
filtered_book_ranking.to_csv(path_or_buf=BOOK_COUNT_CLEANED_DF)