In [1]:
import csv
import time
from datetime import datetime, timezone

import requests

# Data Collection
Now I'll build the dataset and export it to CSV, using what I learned in the earlier experiments.
The dataset covers the top 100 submissions of each month of 2024 from the following subreddits:
- [r/wallstreetbets](https://www.reddit.com/r/wallstreetbets/)
- [r/finance](https://www.reddit.com/r/finance/)
- [r/personalfinance](https://www.reddit.com/r/personalfinance/)
- [r/investing](https://www.reddit.com/r/investing/)

What I need is a table (one row per submission, saved as CSV) with the following columns:
- Subreddit
- Month
- Submission ID
- Submission Title
- Submission Selftext
- Body of Top Comment 1
- Body of Top Comment 2
- ...
- Body of Top Comment 10

I'll use the Pullpush API, which I found while experimenting, to fetch the submissions and their comments.


In [2]:
# Query window for each month of 2024, keyed by month abbreviation.
#
# Each value is a half-open interval (start, end): `start` is midnight on the
# first day of the month and `end` is midnight on the first day of the NEXT
# month. Using the last day of the month as the end bound (e.g. Jan 31 00:00)
# would silently drop every post made on that final day.
#
# The datetimes are timezone-aware (UTC) so that `.timestamp()` yields the same
# epoch seconds on every machine; a naive datetime would be interpreted in the
# kernel's local timezone and shift the window by the local UTC offset.
MONTH_ABBREVIATIONS = [
    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
]

months = {
    abbreviation: (
        datetime(2024, month_number, 1, tzinfo=timezone.utc),
        # December rolls over to January of the following year.
        datetime(2024 + month_number // 12, month_number % 12 + 1, 1, tzinfo=timezone.utc),
    )
    for month_number, abbreviation in enumerate(MONTH_ABBREVIATIONS, start=1)
}

In [9]:
# Collect the top-scoring posts of each month, together with each post's
# top-scoring comments, from the Pullpush API and write one CSV row per post.
#
# CSV layout: subreddit, month, post_id, post_title, post_selftext, tc0..tc9
# (tc<i> is the body of the i-th highest-scoring comment; empty when the post
# has fewer comments, so every row has the same number of columns).

PULLPUSH_SUBMISSIONS_URL = 'https://api.pullpush.io/reddit/submission/search/'
PULLPUSH_COMMENTS_URL = 'https://api.pullpush.io/reddit/comment/search/'
POSTS_PER_MONTH = 100
N_TOP_COMMENTS = 10
REQUEST_TIMEOUT_S = 30   # without a timeout a stalled connection blocks forever
MAX_ATTEMPTS = 6         # attempts per request before giving up
MAX_BACKOFF_S = 60       # cap on the exponential back-off between attempts


def fetch_pullpush(url, params):
    """Return the ``data`` list of a Pullpush search response.

    Retries on non-200 responses, network errors and undecodable bodies,
    sleeping with exponential back-off between attempts so a rate-limited or
    temporarily failing API is not hammered in a tight loop.

    Args:
        url: Pullpush search endpoint.
        params: Query parameters; ``requests`` URL-encodes them.

    Returns:
        The list stored under the ``'data'`` key of the JSON response.

    Raises:
        RuntimeError: If no attempt out of ``MAX_ATTEMPTS`` succeeded.
    """
    last_problem = 'no attempt made'
    for attempt in range(MAX_ATTEMPTS):
        try:
            response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_S)
            if response.status_code == 200:
                return response.json()['data']
            last_problem = f'HTTP {response.status_code}'
        except (requests.RequestException, ValueError) as error:
            # ValueError covers a 200 response whose body is not valid JSON.
            last_problem = repr(error)
        if attempt < MAX_ATTEMPTS - 1:
            time.sleep(min(2 ** attempt, MAX_BACKOFF_S))
    raise RuntimeError(
        f'Pullpush request to {url} with {params} failed after '
        f'{MAX_ATTEMPTS} attempts (last problem: {last_problem})'
    )


def one_line(text):
    """Collapse ``text`` onto a single line; ``None`` becomes ``''``.

    Newlines are replaced by a space (not removed) so the words on either
    side of a line break are not fused together.
    """
    return (text or '').replace('\n', ' ')


# NOTE(review): the notebook's stated goal lists four subreddits, but only
# this one is collected here - confirm whether the others are handled elsewhere.
subreddit = 'wallstreetbets'

# utf-8 is set explicitly: post text routinely contains emoji and other
# characters that some platform default encodings cannot represent.
with open('reddit-data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        'subreddit',
        'month',
        'post_id',
        'post_title',
        'post_selftext',
    ] + [
        f'tc{i}' for i in range(N_TOP_COMMENTS)
    ])

    for mo, (mo_start, mo_end) in months.items():
        posts = fetch_pullpush(PULLPUSH_SUBMISSIONS_URL, {
            'subreddit': subreddit,
            'after': int(mo_start.timestamp()),
            'before': int(mo_end.timestamp()),
            'sort_type': 'score',
            'sort': 'desc',  # explicit, matching the comment query below
            'size': POSTS_PER_MONTH,
        })

        for post in posts:
            top10_comments = fetch_pullpush(PULLPUSH_COMMENTS_URL, {
                'link_id': post['id'],
                'sort_type': 'score',
                'sort': 'desc',
                'size': N_TOP_COMMENTS,
            })

            t10c_bodies = [
                one_line(comment.get('body'))
                for comment in top10_comments[:N_TOP_COMMENTS]
            ]
            # Pad so every row has exactly N_TOP_COMMENTS comment columns.
            t10c_bodies += [''] * (N_TOP_COMMENTS - len(t10c_bodies))

            writer.writerow([
                subreddit,
                mo,
                post['id'],
                post['title'],
                one_line(post.get('selftext')),
            ] + t10c_bodies)

            # Flush after every post so an interrupted run keeps what it has
            # already collected.
            csvfile.flush()



KeyboardInterrupt: 