# Extract Book Titles from The Posts

For every post id: get the actual version of the post, normalize it and search for book titles.

In [1]:
import multiprocessing
import re
from collections import Counter
from functools import partial
from pathlib import Path

import pandas as pd
from tqdm.contrib.concurrent import process_map

from Functions import normalize_text, get_occurrences_for_post

NON_ALPHANUMERIC = re.compile(r'[\W_]+')
SUBREDDIT_NAME = 'books'
EXTRACTED_BOOKS_PATH = './data/books.csv'
IDS_PATH = './data/post_ids.csv'
STORED_POSTS_PATH = './data/stored_posts.csv'
FAUX_TITLES_PATH = Path('./data/faux_book_titles')
BOOK_COUNT_DF = './data/book_counts_raw.csv'

%load_ext autotime

time: 552 µs (started: 2022-10-30 11:57:04 +01:00)


### Prepare Book Titles

And compile them into a regex.

In [2]:
book_titles_df = pd.read_csv(EXTRACTED_BOOKS_PATH, index_col=0)
book_titles_df

Unnamed: 0,Title,Normalized
0,006 and a Half,006andahalf
1,01-01-00,010100
2,05:58,0558
3,08 the Planet of the Tortoise Driver Little Pr...,08theplanetofthetortoisedriverlittleprince
4,"10,000 ways to say I love you",10000waystosayiloveyou
...,...,...
113485,Zvezdy--kholodnye igrushki,zvezdykholodnyeigrushki
113486,Zvirahwe,zvirahwe
113487,The Z Was Zapped,zwaszapped
113488,Zweite Auflage im Altertum; kulturgeschichtlic...,zweiteauflageimaltertumkulturgeschichtlichestu...


time: 680 ms (started: 2022-10-30 11:57:04 +01:00)


In [3]:
title_dict = dict(zip(book_titles_df.Normalized, book_titles_df.Title))
if FAUX_TITLES_PATH.exists():
    faux_titles = {normalize_text(faux_title) for faux_title in FAUX_TITLES_PATH.read_text().splitlines(keepends=False)}
else:
    faux_titles = set()

all_normalized_titles = list(set(book_titles_df.Normalized) - faux_titles)
all_normalized_titles.sort(reverse=True, key=lambda x: len(x))
print(f'Found {len(all_normalized_titles):,} book titles.')

Found 113,231 book titles.
time: 167 ms (started: 2022-10-30 11:57:05 +01:00)


In [4]:
posts_df = pd.read_csv(STORED_POSTS_PATH, index_col=0)
posts_df.fillna('', inplace=True)
posts_df

  posts_df = pd.read_csv(STORED_POSTS_PATH, index_col=0)


Unnamed: 0,ID,Year,Post Text,Comment Text
0,7cexh,2008,funnyparodysffcovers,
1,72bqq,2008,pirateshortfictionfromfantasymagazineshimmerfr...,
2,6sr73,2008,reviewihatedavidsedaris,
3,6dpea,2008,sevendeadlywordsofbookreviewingnewyorktimesblog,
4,7mbbb,2008,2008discoveries,
...,...,...,...,...
289576,x3c279,2022,the2022hugoawardwinnerstobeannouncedonsunday4s...,
289577,tozgxo,2022,juststartedthevegetarianandiwishthehusbandallt...,
289578,xagcox,2022,neverjudgeabookbyitsauthorsfatherearlierthisye...,
289579,xe8nhv,2022,somebriefthoughtsonthewomenofthecoppercountrya...,


time: 1.26 s (started: 2022-10-30 11:57:05 +01:00)


In [5]:
posts = list(posts_df.sample(frac=1).itertuples(index=False, name=None))    # shuffle for better time estimate for the process bar

time: 448 ms (started: 2022-10-30 11:57:06 +01:00)


### Compile the Book Titles to RegEx

The longest book title is first, the shortest is last. The resulting RegEx always returns the longest non-overlapping matches. This way, we always match on the longest title, even if a shorter title is a substring of the long title. (e.g. Only match for 'Pride and Prejudice and Zombies' and not for 'Pride and Prejudice' in the string: 'My favourite book is "Pride and Prejudice and Zombies"')

In [6]:
title_regex = '|'.join(all_normalized_titles)
title_regex = re.compile(title_regex)

time: 4.3 s (started: 2022-10-30 11:57:07 +01:00)


### Search Posts for Book Titles

In [7]:
results = process_map(
    partial(get_occurrences_for_post, title_regex=title_regex),
    posts,
    max_workers=multiprocessing.cpu_count()//2,
    chunksize=250
)

  0%|          | 0/289581 [00:00<?, ?it/s]

time: 2h 44min 2s (started: 2022-10-30 11:57:11 +01:00)


In [8]:
year, post_occ, comment_occ = zip(*results)

per_year_posts = dict()
per_year_comments = dict()

for y in set(year):
    per_year_posts[y] = dict()
    per_year_comments[y] = dict()

for y, p, c in results:
    if p:
        per_year_posts[y] = Counter(per_year_posts[y]) + Counter(p)
    if c:
        per_year_comments[y] = Counter(per_year_posts[y]) + Counter(p)

time: 3min 44s (started: 2022-10-30 14:41:14 +01:00)


In [9]:
# print("Ranked from posts:", sorted(list(post_occ.items()), key=lambda x: x[1], reverse=True))
# print("Ranked from comments:", sorted(list(post_occ.items()), key=lambda x: x[1], reverse=True))
# print("Ranked from posts and comments:",
#       sorted(list((Counter(comment_occ) + Counter(post_occ))
#                   .items()), key=lambda x: x[1], reverse=True))

time: 385 µs (started: 2022-10-30 14:44:58 +01:00)


In [10]:
book_ranking = []
for y in set(year):
    book_ranking.extend([(y, title_dict[book_title], book_title, 'post', occurrences)
                    for book_title, occurrences in per_year_posts[y].items()])
    book_ranking.extend([(y, title_dict[book_title], book_title, 'comment', occurrences)
                         for book_title, occurrences in per_year_comments[y].items()])

time: 106 ms (started: 2022-10-30 14:44:58 +01:00)


In [11]:
book_ranking

[(2016, 'New Moon', 'newmoon', 'post', 5),
 (2016, 'Auden', 'auden', 'post', 6),
 (2016, 'Emile', 'emile', 'post', 16),
 (2016, 'Komarr', 'komarr', 'post', 1),
 (2016, 'The apprentice', 'apprentice', 'post', 9),
 (2016, 'Mirror Dance', 'mirrordance', 'post', 1),
 (2016, 'On Man', 'onman', 'post', 44),
 (2016, 'The Long Run', 'longrun', 'post', 9),
 (2016, 'Mater', 'mater', 'post', 215),
 (2016, 'Fluff', 'fluff', 'post', 11),
 (2016, 'Critical Mass', 'criticalmass', 'post', 2),
 (2016, 'Spaced Out', 'spacedout', 'post', 3),
 (2016, 'Sting', 'sting', 'post', 1654),
 (2016, 'Hungry for more', 'hungryformore', 'post', 1),
 (2016, 'Everyman', 'everyman', 'post', 18),
 (2016, 'The girl of his dreams', 'girlofhisdreams', 'post', 1),
 (2016, 'At-Risk', 'atrisk', 'post', 4),
 (2016, 'Unwell', 'unwell', 'post', 2),
 (2016, 'Dread', 'dread', 'post', 2217),
 (2016, 'The triple package', 'triplepackage', 'post', 1),
 (2016, 'The Chinaman', 'chinaman', 'post', 5),
 (2016, 'Obsidian', 'obsidian', 'po

time: 53.6 ms (started: 2022-10-30 14:44:58 +01:00)


In [12]:
merged_frame = pd.DataFrame(book_ranking, columns=['Year', 'Title', 'Normalized Title', 'Post or Comment', 'Occurrences'])
merged_frame.sort_values('Occurrences', inplace=True, ascending=False, ignore_index=False)
merged_frame.reset_index(drop='index', inplace=True)
merged_frame

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
0,2020,Dread,dread,post,6361
1,2021,Dread,dread,post,5834
2,2018,Dread,dread,post,3411
3,2022,Dread,dread,post,3235
4,2022,Dread,dread,comment,3224
...,...,...,...,...,...
101553,2021,Small Victories,smallvictories,post,1
101554,2021,Dark days,darkdays,post,1
101555,2021,Africa's Tarnished Name,africastarnishedname,post,1
101556,2021,Perfect strangers,perfectstrangers,post,1


time: 1.54 s (started: 2022-10-30 14:44:58 +01:00)


In [13]:
merged_frame.to_csv(BOOK_COUNT_DF)

time: 279 ms (started: 2022-10-30 14:45:00 +01:00)
