# Book Preparation

Processes the list of books to extract titles and authors. The titles are then 'normalized' (all whitespace and special characters are ignored) to make them easier to identify in Reddit posts.

In [1]:
import json
import re
from pathlib import Path

import pandas as pd

from Functions import normalize_text

# --- Patterns ---------------------------------------------------------------
# Matches every run of characters that are not ASCII letters or digits.
NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9]+')

# --- Input files ------------------------------------------------------------
# Dump files read by the loading cells of this notebook.
OL_RATINGS_DUMP = Path('./data/ol_dump_ratings_2022-09-30.txt')
OL_BOOK_DUMP = Path('./data/ol_dump_works_2022-09-30.txt')
# Word lists: entries from these files are never accepted as book titles.
FAUX_BOOK_TITLES = Path('./data/faux_book_titles')
MOST_FREQUENT_WORDS = Path('./data/most_common_words')

# --- Output file ------------------------------------------------------------
# Destination of the extracted (title, normalized title) table.
EXTRACTED_BOOKS_PATH = Path('./data/books.csv')

%load_ext autotime

time: 0 ns (started: 2022-10-27 20:17:27 +02:00)


In [2]:
# Matches any run of whitespace (spaces, tabs, line breaks, ...), including a
# single whitespace character, so that each run can be collapsed to one space.
mult_spaces = re.compile(r'\s+')
def clean_title(title: str) -> str:
    """
    Collapse all whitespace in a title. This is NOT equal to normalize_text(...)

    Every run of whitespace (including newlines, carriage returns and tabs) is
    replaced by a single space and the result is stripped. Line breaks are
    replaced rather than deleted, so words on different lines are not fused
    together ('Foo\\nBar' becomes 'Foo Bar', not 'FooBar').

    :param title: A book title.
    :return: Cleaned book title on a single line with single spaces.
    """
    return mult_spaces.sub(' ', title).strip()

time: 0 ns (started: 2022-10-27 20:17:27 +02:00)


In [3]:
# Read the whole works dump into memory as a list with one element per line.
with OL_BOOK_DUMP.open(encoding='utf-8') as book_list:
    complete = list(book_list)

time: 21.7 s (started: 2022-10-27 20:17:27 +02:00)


In [4]:
# Read the ratings dump into memory as a list with one element per line.
with OL_RATINGS_DUMP.open(encoding='utf-8') as ratings_list:
    ratings = list(ratings_list)

time: 31 ms (started: 2022-10-27 20:17:48 +02:00)


In [5]:
# One entry per line; surrounding whitespace is stripped and duplicates
# collapse because the result is a set.
with MOST_FREQUENT_WORDS.open(encoding='utf-8') as word_list:
    most_common_words = set(map(str.strip, word_list))

time: 0 ns (started: 2022-10-27 20:17:48 +02:00)


In [6]:
# Words/phrases collected during the development process that are very common
# and therefore can't be classified as a book title with certainty when found
# in a post.
# The file is optional: start from an empty set so that the merge below also
# works when the file is missing (otherwise `faux_titles` would be undefined).
faux_titles = set()
if FAUX_BOOK_TITLES.is_file():  # is_file() is already False for a missing path
    with open(FAUX_BOOK_TITLES, encoding='utf-8') as word_list:
        faux_titles = {word.strip() for word in word_list}

# In-place union: the faux titles are treated exactly like the common words.
most_common_words |= faux_titles

time: 0 ns (started: 2022-10-27 20:17:48 +02:00)


In [7]:
# Report how many words/phrases are excluded from being treated as titles.
print(
    f'Found {len(most_common_words):,} phrases and words that are too common '
    'and are ignored as titles (e.g. "she", "and", ...)'
)

Found 10,754 phrases and words that are too common and are ignored as titles (e.g. "she", "and", ...)
time: 578 ms (started: 2022-10-27 20:17:49 +02:00)


In [8]:
# Remove all books that were never rated or only rated badly
def extract_rating(entry):
    """
    Pull the key and the rating out of one tab-separated line.

    :param entry: A single line whose tab-separated columns hold a key in the
        first column and an integer rating in the third column
        (presumably one line of the ratings dump; verify against the dump format).
    :return: Tuple of (first column as string, third column converted to int).
    """
    columns = entry.split("\t")
    key, score = columns[0], columns[2]
    return key, int(score)

# Exclusive threshold: a book is kept only if at least one of its ratings is
# strictly greater than this value.
min_rating = 2
# Set of keys (first column of the ratings lines) that have such a rating.
rated_books = {book_key for book_key, score in map(extract_rating, ratings) if score > min_rating}
# The message says "above" because the comparison is strict.
print(f'Found {len(rated_books):,} books with at least one rating above {min_rating} in open library.')

Found 140,288 books with at least one rating of 2 or more in open library.
time: 125 ms (started: 2022-10-27 20:17:49 +02:00)


In [9]:
from tqdm.auto import tqdm

# Normalized titles accepted so far; process_entry() uses it to skip duplicates.
title_set = set()

# Keep only books whose key is in `rated_books`, whose normalized title is 5 to 99
# characters long (or a purely numeric title of 4 to 19 characters), is not one of the
# too-common words/phrases and has not been collected before.
# NOTE(review): an earlier version of this comment also mentioned excluding author
# names used as titles; no such check is visible here - presumably covered by the
# word lists, confirm.
def process_entry(entry):
    """
    Turn one line of the book dump into a (title, normalized title) pair.

    The fifth tab-separated column of the line is parsed as JSON. The record is
    only considered when it has a 'title' and a 'key' and the stripped key is
    contained in ``rated_books``.

    A leading 'The ', 'An ' or 'A ' (case-insensitive) on the cleaned title
    removes the first 3, 2 or 1 characters of the normalized title. This
    assumes normalize_text(...) drops the space and keeps the article's letters
    at the start of its result - verify against normalize_text.

    Side effect: an accepted normalized title is added to ``title_set``.

    :param entry: One raw line of the book dump.
    :return: Tuple (cleaned title, normalized title) for an accepted book,
        otherwise None.
    """
    record = json.loads(entry.split('\t')[4])
    if 'title' not in record or 'key' not in record:
        return None
    if record['key'].strip() not in rated_books:
        return None

    title = clean_title(record['title'])
    normalized_title = normalize_text(title)

    # The article prefixes are mutually exclusive, so at most one matches.
    for article in ('the ', 'an ', 'a '):
        if title[:len(article)].lower() == article:
            # Skip the article's letters only (the prefix length minus its space).
            normalized_title = normalized_title[len(article) - 1:]
            break

    if not normalized_title:
        return None

    length = len(normalized_title)
    # Regular titles: 5..99 characters. Purely numeric titles: 4..19 characters.
    if not (4 < length < 100 or (normalized_title.isnumeric() and 3 < length < 20)):
        return None
    if normalized_title in most_common_words or normalized_title in title_set:
        return None

    title_set.add(normalized_title)
    return title, normalized_title

# Run every dump line through process_entry(); rejected lines yield None and
# are dropped by filter(None, ...). tqdm shows progress while the lines are consumed.
as_list = list(filter(None, map(process_entry, tqdm(complete, unit='Titles'))))
print(f'Collected the titles of {len(as_list):,} books. All other {len(complete) - len(as_list):,} books were discarded.')

  0%|          | 0/26201547 [00:00<?, ?Titles/s]

Collected the titles of 113,490 books. All other 26,088,057 books were discarded.
time: 2min 11s (started: 2022-10-27 20:17:49 +02:00)


In [10]:
# Order the (title, normalized title) pairs in place by their normalized title.
# Sorting does not change the length, so the count printed below is unaffected.
as_list.sort(key=lambda pair: pair[1])
print(f'{len(as_list):,} books indexed.')
# Preview of the first five pairs after sorting.
as_list[:5]

113,490 books indexed.


[('006 and a Half', '006andahalf'),
 ('01-01-00', '010100'),
 ('05:58', '0558'),
 ('08 the Planet of the Tortoise Driver Little Prince',
  '08theplanetofthetortoisedriverlittleprince'),
 ('10,000 ways to say I love you', '10000waystosayiloveyou')]

time: 63 ms (started: 2022-10-27 20:20:00 +02:00)


In [11]:
# Pairs whose normalized title is longer than 15 characters.
# NOTE(review): `f` is not referenced by any later cell visible in this notebook.
f = [pair for pair in as_list if len(pair[1]) > 15]

time: 32 ms (started: 2022-10-27 20:20:00 +02:00)


In [12]:
# Two-column frame: the cleaned title and its normalized form.
books_df = pd.DataFrame(
    data=as_list,
    columns=['Title', 'Normalized'],
)

time: 31 ms (started: 2022-10-27 20:20:01 +02:00)


In [13]:
# Persist the extracted titles. `index` is left at its default, so the row
# index is written as the first column of the CSV file as well.
books_df.to_csv(
    path_or_buf=EXTRACTED_BOOKS_PATH,
    encoding='utf-8',
)

time: 187 ms (started: 2022-10-27 20:20:01 +02:00)
