# Reddit Scraping

Iterate Posts on r/books and count book occurrences.

In [1]:
import datetime as dt
import re
from pathlib import Path

import pandas as pd
import praw
import yaml
from psaw import PushshiftAPI
from tqdm.auto import tqdm

NON_ALPHANUMERIC = re.compile(r'[\W_]+')
EXTRACTED_BOOKS_PATH = './data/books.csv'
SUBREDDIT_NAME = 'books'
IDS_PATH = './data/post_ids.csv'
OLDEST_POST = int(dt.datetime(2005, 6, 23).timestamp())  # Reddit founding date

## Connect  to APIs

In [2]:
with open('./keys', 'r') as file:
    keys = yaml.safe_load(file)

In [3]:
reddit = praw.Reddit(client_id=keys['client_id'],
                     client_secret=keys['client_secret'],
                     user_agent=keys['user_agent'])
ps_api = PushshiftAPI(reddit)

In [4]:
books_sub = reddit.subreddit(SUBREDDIT_NAME)

## Check already downloaded IDs

In [5]:
if Path(IDS_PATH).exists():
    ids_df = pd.read_csv(IDS_PATH, index_col=0)
    list_of_ids = list(ids_df.itertuples(index=False, name=None))
    list_of_ids.sort(key=lambda x: x[1])
    OLDEST_POST = list_of_ids[-1][1]
    print(f'Loaded {len(list_of_ids):,} posts. Newest Post: {dt.datetime.fromtimestamp(OLDEST_POST)}')

else:
    list_of_ids = []

## Get all Post IDs from Pushshift

In [6]:
time_to_end = dt.datetime(2022, 10, 1, 0, 0, 0).timestamp()
# time_to_end = int(dt.datetime(2014, 6, 25).timestamp())

In [7]:
latest_timestamp = time_to_end

In [9]:
pbar = tqdm(total=int(time_to_end - OLDEST_POST))

all_collected = False
while latest_timestamp >= OLDEST_POST and not all_collected:
    submissions = ps_api.search_submissions(after=int(OLDEST_POST),
                                            before=int(latest_timestamp),
                                            subreddit=SUBREDDIT_NAME,
                                            limit=1000)

    submissions = list(submissions)
    if len(submissions) > 0:
        olt = latest_timestamp
        latest_timestamp = int(min(submissions, key=lambda x: x.created).created)

        list_of_ids.extend(
            [(submission.id, int(submission.created)) for submission in submissions]
        )

        pbar.set_description(f'Collected {len(list_of_ids):,} post IDs.', refresh=True)
        pbar.update(olt - latest_timestamp)
        print(f'Got ID of {len(submissions)} post(s)\r')

    else:
        all_collected = True
        print('All available IDs retrieved.')

pbar.n = pbar.total
pbar.close()
pbar.clear()

  0%|          | 0/545097600 [00:00<?, ?it/s]



Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 999 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 997 post(s)
Got ID of 999 post(s)
Got ID of 997 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 998 post(s)
Got ID of 997 post(s)
Got ID of 998 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 999 post(s)
Got ID of 994 post(s)
Got ID of 999 post(s)




Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 998 post(s)
Got ID of 998 post(s)




Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 997 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 996 post(s)
Got ID of 996 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 999 post(s)
Got ID of 994 post(s)
Got ID of 999 post(s)
Got ID of 998 post(s)
Got ID of 999 post(s)
Got ID of 1000 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 999 post(s)
Got ID of 997 post(s)
Got ID of 1000 post(s)
Got ID of 996 post(s)
Got ID of 1000 post(s)
Got ID of 1000 post(s)
Got ID of 1000 post(s)
Got ID of 998 post(s)
Got ID of 1000 post(s)
Got ID of 999 post(s)
Got ID of 998 post(

In [1]:
nd = dt.datetime.fromtimestamp(max(list_of_ids, key=lambda x: x[1])[1]).strftime("%Y-%m-%d, %H:%M:%S")
ni = max(list_of_ids, key=lambda x: x[1])[0]
od = dt.datetime.fromtimestamp(min(list_of_ids, key=lambda x: x[1])[1]).strftime("%Y-%m-%d, %H:%M:%S")
oi = min(list_of_ids, key=lambda x: x[1])[0]

print(f'{len(list_of_ids)} Post ID collected.\n'
      f'Oldest post from {od} (ID={oi})\n'
      f'Newest post from {nd} (ID={ni})')

NameError: name 'dt' is not defined

In [11]:
oldest_post_on_sub = reddit.submission(id=oi)
oldest_post_on_sub.title

'Preacher Vol 1: Gone to Texas, by Garth Ennis'

### Save the retrieved IDs

In [12]:
ids_df = pd.DataFrame(list_of_ids, columns=['ID', 'Timestamp'])
ids_df = ids_df.dropna()
ids_df.to_csv(IDS_PATH)