In [35]:
import os
import praw
import requests
import pandas
from datetime import datetime
from pprint import pprint
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

# Reddit Data Experiments - Upanshu Parekh (uparekh2)
For this part I will be experimenting with PRAW and the PullPush API to scrape the top 100 posts of January 2024 for the [r/wallstreetbets](https://www.reddit.com/r/wallstreetbets/) subreddit. This part is simply just an exploration of the libraries / APIs. Once I understand the general approach, there'll be another Jupyter notebook extracting just the data I need.

- [r/finance](https://www.reddit.com/r/finance/)
- [r/personalfinance](https://www.reddit.com/r/personalfinance/)
- [r/investing](https://www.reddit.com/r/investing/)

## PRAW Attempts

In [2]:
reddit = praw.Reddit(
    client_id=os.getenv('PRAW_KEY'),
    client_secret=os.getenv('PRAW_SECRET'),
    username=os.getenv('PRAW_USERNAME'),
    password=os.getenv('PRAW_PASSWORD'),
    user_agent='scraper by /u/blc5_'
)

In [3]:
reddit.user.me()

Redditor(name='blc5_')

In [4]:
wsb = reddit.subreddit("wallstreetbets")

In [5]:
pprint(vars(wsb))

{'_fetched': False,
 '_path': 'r/wallstreetbets/',
 '_reddit': <praw.reddit.Reddit object at 0x104ac0950>,
 'display_name': 'wallstreetbets'}


In [7]:
# Get the top submission from the subreddit for viewing dict keys
for submission in wsb.top(limit=1):
    pprint(vars(submission))

{'_additional_fetch_params': {},
 '_comments_by_id': {},
 '_fetched': False,
 '_reddit': <praw.reddit.Reddit object at 0x104ac0950>,
 'all_awardings': [],
 'allow_live_comments': True,
 'approved_at_utc': None,
 'approved_by': None,
 'archived': False,
 'author': Redditor(name='SomeGuyInDeutschland'),
 'author_flair_background_color': '',
 'author_flair_css_class': None,
 'author_flair_richtext': [{'e': 'text', 't': 'TC or GTFO'}],
 'author_flair_template_id': None,
 'author_flair_text': 'TC or GTFO',
 'author_flair_text_color': 'dark',
 'author_flair_type': 'richtext',
 'author_fullname': 't2_cxhbp',
 'author_is_blocked': False,
 'author_patreon_flair': False,
 'author_premium': True,
 'awarders': [],
 'banned_at_utc': None,
 'banned_by': None,
 'can_gild': False,
 'can_mod_post': False,
 'category': None,
 'clicked': False,
 'comment_limit': 2048,
 'comment_sort': 'confidence',
 'content_categories': None,
 'contest_mode': False,
 'created': 1612029638.0,
 'created_utc': 1612029638.0

## PullPush Attempt
Apparently this service provides a more convenient API for read-only data between old Reddit data that used to be provided by a holistic service called PushShift and augmenting it with the Reddit API.

It doesn't go against Reddit's TOS, apparently. It's been up for a year or so, read up on their website to see more: https://pullpush.io

In [38]:
# Define the dates
date_start = datetime(2024, 1, 1)
date_end = datetime(2024, 1, 31)

# Convert to timestamps
ts_beg = int(date_start.timestamp())
ts_end = int(date_end.timestamp())

print(f"Timestamp for 01-01-2024: {ts_beg}")
print(f"Timestamp for 01-31-2024: {ts_end}")

Timestamp for 01-01-2024: 1704088800
Timestamp for 01-31-2024: 1706680800


In [39]:
response = requests.get(f'https://api.pullpush.io/reddit/submission/search/?subreddit=wallstreetbets&after={ts_beg}&before={ts_end}&sort_type=score&sort=desc&size=100')
response.raise_for_status()  # Raise an error for bad responses
data = response.json()

In [None]:
# response = requests.get('https://api.pullpush.io/reddit/submission/search/?ids=1hv1wuj')
# response.raise_for_status()  # Raise an error for bad responses
# data = response.json()

In [None]:
# subm = data['data'][0]
# subm.keys()
# # subm['selftext'] contains the text of the submission

dict_keys(['approved_at_utc', 'subreddit', 'selftext', 'author_fullname', 'saved', 'mod_reason_title', 'gilded', 'clicked', 'title', 'link_flair_richtext', 'subreddit_name_prefixed', 'hidden', 'pwls', 'link_flair_css_class', 'downs', 'thumbnail_height', 'top_awarded_type', 'hide_score', 'name', 'quarantine', 'link_flair_text_color', 'upvote_ratio', 'author_flair_background_color', 'subreddit_type', 'ups', 'total_awards_received', 'media_embed', 'thumbnail_width', 'author_flair_template_id', 'is_original_content', 'user_reports', 'secure_media', 'is_reddit_media_domain', 'is_meta', 'category', 'secure_media_embed', 'link_flair_text', 'can_mod_post', 'score', 'approved_by', 'is_created_from_ads_ui', 'author_premium', 'thumbnail', 'edited', 'author_flair_css_class', 'author_flair_richtext', 'gildings', 'post_hint', 'content_categories', 'is_self', 'mod_note', 'created', 'link_flair_type', 'wls', 'removed_by_category', 'banned_by', 'author_flair_type', 'domain', 'allow_live_comments', 'sel

In [None]:
num_selftext = 0
for subm in data['data']:
    if subm['selftext']:
        num_selftext += 1
    # print(f"Id: {subm['id']}")
    # print(f"Title: {subm['title']}")
    # print(f"Score: {subm['score']}")
    # print(f"Created UTC: {datetime.fromtimestamp(subm['created_utc'])}")
    # print(f"URL: {subm['url']}")
    # print(f"Selftext: {subm['selftext'][:100]}...")  # Print first 100 chars of selftext
    # print("-" * 40)

print(f"Number of submissions with selftext: {num_selftext}")
print(f"Number of submissions without selftext: {len(data['data']) - num_selftext}")

Number of submissions with selftext: 32
Number of submissions without selftext: 68


Not that many submissions without selftext, maybe a good idea would be to get the text of the top 10 comments for each post and include it in the data for a submission. Maybe I'll combine title + selftext + all comment texts as the corpus for each submission.

I think the final columns for each submission entry could then be:
- Subreddit
- Month
- Submission ID
- Submission Title
- Submission
- Top Comment 1
- Top Comment 2
- ...
- Top Comment 10


In [None]:
sub0 = data['data'][0]
sub0_id = sub0['id']
top10comments = []
try:
    resp = requests.get(f'https://api.pullpush.io/reddit/comment/search/?link_id={sub0_id}&sort_type=score&sort=desc&size=10')
    resp.raise_for_status()
    top10comments = resp.json()
except requests.exceptions.RequestException as e:
    print(f"Error fetching comments: {e}")


In [48]:
len(top10comments['data'])

10

In [53]:
com0 = top10comments['data'][2]
com0.keys()

dict_keys(['subreddit_id', 'approved_at_utc', 'author_is_blocked', 'comment_type', 'edited', 'mod_reason_by', 'banned_by', 'ups', 'num_reports', 'author_flair_type', 'total_awards_received', 'subreddit', 'author_flair_template_id', 'likes', 'replies', 'user_reports', 'saved', 'id', 'banned_at_utc', 'mod_reason_title', 'gilded', 'archived', 'collapsed_reason_code', 'no_follow', 'author', 'can_mod_post', 'send_replies', 'parent_id', 'score', 'author_fullname', 'report_reasons', 'removal_reason', 'approved_by', 'all_awardings', 'body', 'awarders', 'top_awarded_type', 'downs', 'author_flair_css_class', 'author_patreon_flair', 'collapsed', 'author_flair_richtext', 'is_submitter', 'body_html', 'gildings', 'collapsed_reason', 'associated_award', 'stickied', 'author_premium', 'can_gild', 'link_id', 'unrepliable_reason', 'author_flair_text_color', 'score_hidden', 'permalink', 'subreddit_type', 'locked', 'name', 'created', 'author_flair_text', 'treatment_tags', 'created_utc', 'subreddit_name_pre

In [54]:
com0['body']

"It's like minority report, the crash was priced in.before it even happened. Sorry, bro, but you'll die for my puts."