# Extract Book Titles from The Posts

For every post id: get the actual version of the post, normalize it and search for book titles.

In [1]:
import datetime as dt
import multiprocessing
import re
from functools import partial
from pathlib import Path

import pandas as pd
import praw
import yaml
from tqdm.contrib.concurrent import process_map

from BookList.Functions import download_and_prepare_posts

NON_ALPHANUMERIC = re.compile(r'[\W_]+')
EXTRACTED_BOOKS_PATH = './data/books.csv'
SUBREDDIT_NAME = 'books'
IDS_PATH = './data/post_ids.csv'
STORED_POSTS_DF = './data/stored_posts.csv'

%load_ext autotime

time: 0 ns (started: 2022-10-12 20:55:28 +02:00)


### Connect  to API

In [2]:
with open('./keys', 'r') as file:
    keys = yaml.safe_load(file)

reddit = praw.Reddit(client_id=keys['client_id'],
                     client_secret=keys['client_secret'],
                     user_agent=keys['user_agent'])

time: 0 ns (started: 2022-10-12 20:55:28 +02:00)


In [3]:
list(reddit.subreddit('books').hot())[0].title

'The /r/books Book Club Selection + AMA for October is "Lakewood" by Megan Giddings'

time: 2.5 s (started: 2022-10-12 20:55:29 +02:00)


### Prepare Post IDs

In [4]:
post_ids_df = pd.read_csv(IDS_PATH, index_col=0)
post_ids_df

Unnamed: 0,ID,Timestamp
0,xsdsa9,1664574931
1,xsds3u,1664574919
2,xsdi3s,1664574204
3,xsdb2g,1664573697
4,xsd3qe,1664573185
...,...,...
546214,66nzq,1201327097
546215,66nbv,1201305939
546216,66mal,1201286647
546217,66lvc,1201279981


time: 187 ms (started: 2022-10-12 20:55:32 +02:00)


In [5]:
all_post_ids = set(zip(post_ids_df.ID, [dt.datetime.fromtimestamp(ts).year for ts in post_ids_df.Timestamp]))
del post_ids_df

time: 328 ms (started: 2022-10-12 20:55:33 +02:00)


### Prepare Existing Downloaded Posts

In [6]:
if Path(STORED_POSTS_DF).is_file():
    stored_posts_df = pd.read_csv(STORED_POSTS_DF, index_col=0)[['ID', 'Year', 'Post Text', 'Comment Text']]
else:
    stored_posts_df = pd.DataFrame(columns=['ID', 'Year', 'Post Text', 'Comment Text'])

stored_posts_ids = set(zip(stored_posts_df.ID, stored_posts_df.Year))
print(f'Loaded {len(stored_posts_ids):,} stored posts.')
stored_posts_df

Loaded 289581 stored posts.


  stored_posts_df = pd.read_csv(STORED_POSTS_DF, index_col=0)[['ID', 'Year', 'Post Text', 'Comment Text']]


Unnamed: 0,ID,Year,Post Text,Comment Text
0,7cexh,2008,funnyparodysffcovers,
1,72bqq,2008,pirateshortfictionfromfantasymagazineshimmerfr...,
2,6sr73,2008,reviewihatedavidsedaris,
3,6dpea,2008,sevendeadlywordsofbookreviewingnewyorktimesblog,
4,7mbbb,2008,2008discoveries,
...,...,...,...,...
289576,x3c279,2022,the2022hugoawardwinnerstobeannouncedonsunday4s...,
289577,tozgxo,2022,juststartedthevegetarianandiwishthehusbandallt...,
289578,xagcox,2022,neverjudgeabookbyitsauthorsfatherearlierthisye...,
289579,xe8nhv,2022,somebriefthoughtsonthewomenofthecoppercountrya...,


time: 859 ms (started: 2022-10-12 20:55:36 +02:00)


In [7]:
all_post_ids = list(all_post_ids - stored_posts_ids)
print(f'{len(all_post_ids):,} left.')

256638 left.
time: 125 ms (started: 2022-10-12 20:55:38 +02:00)


### Download Posts and Comments and Normalize Them

In [47]:
# https://www.reddit.com/r/redditdev/comments/rsz7za/getting_submissions_from_praw_extremely_slow/
# https://www.reddit.com/r/redditdev/comments/atrt4i/praw_is_there_a_way_to_get_an_objects/

# retrieved_posts = set()
#
# fullnames = [f't3_{p_id}' for p_id, year in all_post_ids]
# deleted = {'[removed]', '[deleted]', '[deleted by user]'}
#
# for submission, (p_id, year) in (pbar := tqdm(zip(reddit.info(fullnames=fullnames), all_post_ids), total=len(all_post_ids), unit='post(s)')):
#     pbar.set_description(f'Process post with id: {p_id}', refresh=True)
#     title = submission.title
#     post_text = submission.selftext
#
#     if post_text not in deleted:
#         comments_fullnames = [f't3_{c_id}' for c_id in submission.comments.list()]
#         all_comments = [text for comment in reddit.info(fullnames=comments_fullnames) if (text := comment.selftext) not in deleted]
#         comments_text = normalize_text(''.join(all_comments))
#
#         retrieved_posts.add((p_id, year, normalize_text(title + post_text), comments_text))
#
# print(retrieved_posts)

time: 0 ns (started: 2022-10-12 20:34:16 +02:00)


In [48]:
def split(list_to_split, number_of_resulting_parts):
    k, m = divmod(len(list_to_split), number_of_resulting_parts)
    return [list_to_split[i*k+min(i, m):(i+1)*k+min(i+1, m)] for i in range(number_of_resulting_parts)]

time: 0 ns (started: 2022-10-12 20:34:16 +02:00)


In [12]:
# Split into chunks with size=100. This maximizes throughput for the reddit api.
all_post_ids_splitted = [all_post_ids[i:i+100] for i in range(0, len(all_post_ids), 100)]

time: 0 ns (started: 2022-10-12 20:56:03 +02:00)


In [13]:
results = process_map(
    partial(download_and_prepare_posts, praw_client_id=keys['client_id'], praw_client_secret=keys['client_secret'], praw_client_ua=keys['user_agent']),
    all_post_ids_splitted,
    max_workers=multiprocessing.cpu_count(),
)

print(f'Downloaded {len(results)} posts. Discarded {len(all_post_ids) - len(results)} posts.')

  results = process_map(


  0%|          | 0/2567 [00:00<?, ?it/s]

Downloaded 2567 posts. Discarded 254071 posts.
time: 5min 45s (started: 2022-10-12 20:56:03 +02:00)


In [14]:
retrieved_posts = set().union(*results)

time: 0 ns (started: 2022-10-12 21:01:49 +02:00)


In [15]:
result_df = pd.DataFrame(retrieved_posts, columns=['ID', 'Year', 'Post Text', 'Comment Text'])
result_df

Unnamed: 0,ID,Year,Post Text,Comment Text


time: 0 ns (started: 2022-10-12 21:01:49 +02:00)


In [23]:
new_stored = pd.concat([stored_posts_df, result_df], ignore_index=True)
new_stored.replace(pd.NA, '', inplace=True)
new_stored.sort_values('Year', inplace=True, ignore_index=True)
new_stored.reset_index(inplace=True)
new_stored = new_stored[['ID', 'Year', 'Post Text', 'Comment Text']]
new_stored

Unnamed: 0,ID,Year,Post Text,Comment Text
0,7cexh,2008,funnyparodysffcovers,
1,78vn1,2008,fantasticnameforabookcompany,
2,71mzs,2008,picturedthebeautifulfarmgirlwhoinspiredthomash...,
3,7evr9,2008,askbooksredditwhatbookpublishedrecentlywillsti...,
4,683c2,2008,thegreatnessguidebyrobinsharma,
...,...,...,...,...
289576,rxfm21,2022,checkbookregister,
289577,swbabz,2022,oneitaliansummerareviewbydi,
289578,szb557,2022,hardcopyvsaudiobooktheinvisiblelifeofaddielaru...,
289579,vyia8f,2022,intexaswomencrimeauthorshavefinallyescapedthes...,


time: 234 ms (started: 2022-10-12 21:04:44 +02:00)


In [24]:
new_stored.to_csv(STORED_POSTS_DF, encoding='utf-8')

print(f'Stored {len(new_stored):,} posts')

Stored 289,581 posts
time: 1.47 s (started: 2022-10-12 21:04:57 +02:00)
