# Extract Book Titles from The Posts

For every post id: get the actual version of the post, normalize it and search for book titles.

In [1]:
import multiprocessing
import re
from collections import Counter
from functools import partial
from pathlib import Path

import pandas as pd
from tqdm.contrib.concurrent import process_map

from Functions import normalize_text, get_occurrences_for_post

NON_ALPHANUMERIC = re.compile(r'[\W_]+')
SUBREDDIT_NAME = 'books'
EXTRACTED_BOOKS_PATH = './data/books.csv'
IDS_PATH = './data/post_ids.csv'
STORED_POSTS_PATH = './data/stored_posts.csv'
FAUX_TITLES_PATH = Path('./data/faux_book_titles')
BOOK_COUNT_DF = './data/book_counts_raw.csv'

%load_ext autotime

time: 605 µs (started: 2022-10-14 20:51:44 +02:00)


### Prepare Book Titles

And compile them into a regex.

In [2]:
book_titles_df = pd.read_csv(EXTRACTED_BOOKS_PATH, index_col=0)
book_titles_df

Unnamed: 0,Title,Normalized
0,006 and a Half,006andahalf
1,01-01-00,010100
2,05:58,0558
3,08 the Planet of the Tortoise Driver Little Pr...,08theplanetofthetortoisedriverlittleprince
4,"10,000 ways to say I love you",10000waystosayiloveyou
...,...,...
110919,Zvezdy--kholodnye igrushki,zvezdykholodnyeigrushki
110920,Zvirahwe,zvirahwe
110921,The Z Was Zapped,zwaszapped
110922,Zweite Auflage im Altertum; kulturgeschichtlic...,zweiteauflageimaltertumkulturgeschichtlichestu...


time: 147 ms (started: 2022-10-14 20:51:44 +02:00)


In [3]:
title_dict = dict(zip(book_titles_df.Normalized, book_titles_df.Title))
if FAUX_TITLES_PATH.exists():
    faux_titles = {normalize_text(faux_title) for faux_title in FAUX_TITLES_PATH.read_text().splitlines(keepends=False)}
else:
    faux_titles = set()

all_normalized_titles = list(set(book_titles_df.Normalized) - faux_titles)
all_normalized_titles.sort(reverse=True, key=lambda x: len(x))
print(f'Found {len(all_normalized_titles):,} book titles.')

Found 110,730 book titles.
time: 82.6 ms (started: 2022-10-14 20:51:45 +02:00)


In [4]:
posts_df = pd.read_csv(STORED_POSTS_PATH, index_col=0)
posts_df.fillna('', inplace=True)
posts_df

  posts_df = pd.read_csv(STORED_POSTS_PATH, index_col=0)


Unnamed: 0,ID,Year,Post Text,Comment Text
0,7cexh,2008,funnyparodysffcovers,
1,72bqq,2008,pirateshortfictionfromfantasymagazineshimmerfr...,
2,6sr73,2008,reviewihatedavidsedaris,
3,6dpea,2008,sevendeadlywordsofbookreviewingnewyorktimesblog,
4,7mbbb,2008,2008discoveries,
...,...,...,...,...
289576,x3c279,2022,the2022hugoawardwinnerstobeannouncedonsunday4s...,
289577,tozgxo,2022,juststartedthevegetarianandiwishthehusbandallt...,
289578,xagcox,2022,neverjudgeabookbyitsauthorsfatherearlierthisye...,
289579,xe8nhv,2022,somebriefthoughtsonthewomenofthecoppercountrya...,


time: 965 ms (started: 2022-10-14 20:51:45 +02:00)


In [5]:
posts = list(posts_df.sample(frac=1).itertuples(index=False, name=None))    # shuffle for better time estimate for the process bar

time: 285 ms (started: 2022-10-14 20:51:46 +02:00)


### Compile the Book Titles to RegEx

The longest book title is first, the shortest is last. The resulting RegEx always returns the longest non-overlapping matches. This way, we always match on the longest title, even if a shorter title is a substring of the long title. (e.g. Only match for 'Pride and Prejudice and Zombies' and not for 'Pride and Prejudice' in the string: 'My favourite book is "Pride and Prejudice and Zombies"')

In [6]:
title_regex = '|'.join(all_normalized_titles)
title_regex = re.compile(title_regex)

time: 4.18 s (started: 2022-10-14 20:51:46 +02:00)


### Search Posts for Book Titles

In [7]:
results = process_map(
    partial(get_occurrences_for_post, title_regex=title_regex),
    posts,
    max_workers=multiprocessing.cpu_count(),
    chunksize=250
)

  0%|          | 0/289581 [00:00<?, ?it/s]

time: 2h 53min 48s (started: 2022-10-14 20:51:50 +02:00)


In [8]:
year, post_occ, comment_occ = zip(*results)

per_year_posts = dict()
per_year_comments = dict()

for y in set(year):
    per_year_posts[y] = dict()
    per_year_comments[y] = dict()

for y, p, c in results:
    if p:
        per_year_posts[y] = Counter(per_year_posts[y]) + Counter(p)
    if c:
        per_year_comments[y] = Counter(per_year_posts[y]) + Counter(p)

time: 3min 26s (started: 2022-10-14 23:45:39 +02:00)


In [9]:
# print("Ranked from posts:", sorted(list(post_occ.items()), key=lambda x: x[1], reverse=True))
# print("Ranked from comments:", sorted(list(post_occ.items()), key=lambda x: x[1], reverse=True))
# print("Ranked from posts and comments:",
#       sorted(list((Counter(comment_occ) + Counter(post_occ))
#                   .items()), key=lambda x: x[1], reverse=True))

time: 366 µs (started: 2022-10-14 23:49:06 +02:00)


In [10]:
book_ranking = []
for y in set(year):
    book_ranking.extend([(y, title_dict[book_title], book_title, 'post', occurrences)
                    for book_title, occurrences in per_year_posts[y].items()])
    book_ranking.extend([(y, title_dict[book_title], book_title, 'comment', occurrences)
                         for book_title, occurrences in per_year_comments[y].items()])

time: 128 ms (started: 2022-10-14 23:49:06 +02:00)


In [11]:
book_ranking

[(2016, 'The Heir', 'heir', 'post', 1961),
 (2016, 'Addicted', 'addicted', 'post', 24),
 (2016, 'Tron', 'tron', 'post', 738),
 (2016, 'The Omen', 'omen', 'post', 1132),
 (2016, 'Owed', 'owed', 'post', 276),
 (2016, 'Poemas', 'poemas', 'post', 8),
 (2016, 'Namesake', 'namesake', 'post', 10),
 (2016, 'Discus', 'discus', 'post', 1098),
 (2016, 'How to make money', 'howtomakemoney', 'post', 44),
 (2016, 'The coma', 'coma', 'post', 282),
 (2016, 'Rant', 'rant', 'post', 435),
 (2016, 'Erec', 'erec', 'post', 612),
 (2016, 'Need to know', 'needtoknow', 'post', 46),
 (2016, 'Comet', 'comet', 'post', 540),
 (2016, 'Head on', 'headon', 'post', 59),
 (2016, 'Infinite jest', 'infinitejest', 'post', 172),
 (2016, 'Two Years Ago', 'twoyearsago', 'post', 18),
 (2016, 'The Tain', 'tain', 'post', 1240),
 (2016, 'Where you are', 'whereyouare', 'post', 8),
 (2016, 'Hell', 'hell', 'post', 764),
 (2016, 'One Day', 'oneday', 'post', 63),
 (2016, 'Mean Girl', 'meangirl', 'post', 3),
 (2016, 'The pact', 'pact'

time: 55.6 ms (started: 2022-10-14 23:49:06 +02:00)


In [12]:
merged_frame = pd.DataFrame(book_ranking, columns=['Year', 'Title', 'Normalized Title', 'Post or Comment', 'Occurrences'])
merged_frame.sort_values('Occurrences', inplace=True, ascending=False, ignore_index=False)
merged_frame.reset_index(drop='index', inplace=True)
merged_frame

Unnamed: 0,Year,Title,Normalized Title,Post or Comment,Occurrences
0,2021,The Heir,heir,post,4080
1,2020,The Heir,heir,post,3236
2,2022,The Heir,heir,post,3153
3,2022,The Heir,heir,comment,3130
4,2021,IONI,ioni,post,2615
...,...,...,...,...,...
89779,2021,Skeletons on the Zahara,skeletonsonthezahara,post,1
89780,2021,White Gold,whitegold,post,1
89781,2021,The church mouse,churchmouse,post,1
89782,2021,Vampire's kiss,vampireskiss,post,1


time: 1.62 s (started: 2022-10-14 23:49:06 +02:00)


In [13]:
merged_frame.to_csv(BOOK_COUNT_DF)

time: 287 ms (started: 2022-10-14 23:49:08 +02:00)
