In [1]:
# Imports

import time
from datetime import datetime, timezone

import pandas as pd
import requests

In [2]:
# Scraper configuration (module-level constants)

BASE_URL = "https://api.pullpush.io/reddit/search/submission"  # submission search endpoint
MAX_POSTS_PER_REQUEST = 100  # sent as the "size" query parameter on every request
SLEEP_TIMER = 1.0  # seconds to pause between consecutive requests

In [3]:
def fetch_reddit_data(subreddit, score, after, before):
    """
    Fetch one page of Reddit submissions from the PullPush.io API
    (alternative to PushShift after July 1st, 2023).

    At most MAX_POSTS_PER_REQUEST submissions are requested, sorted in
    ascending order.

    :param subreddit: subreddit to scrape from
    :param score: minimum submission score to be selected (sent as ">score")
    :param after: submission must be after this date
    :param before: submission must be before this date
    :return: list of dicts with the keys title, selftext, score,
        num_comments, gilded_count and date (one dict per submission);
        an empty list if the request fails or returns a non-200 status
    """
    params = {
        "subreddit": subreddit,
        "size": MAX_POSTS_PER_REQUEST,
        "score": f">{score}",
        "after": after,
        "before": before,
        "sort": "asc"
    }
    try:
        # Without a timeout a stalled connection would block the scrape forever.
        response = requests.get(BASE_URL, params=params, timeout=30)
    except requests.RequestException as exc:
        # Report and return an empty page (same as the non-200 path) so the
        # caller keeps whatever it has already collected instead of crashing.
        print(f"Request failed ({exc}): Unable to fetch data from PullPush.io")
        return []

    if response.status_code != 200:
        print(f"Error {response.status_code}: Unable to fetch data from PullPush.io")
        return []

    # Tolerate a missing or null "data" key by treating it as an empty page.
    data = response.json().get("data") or []

    # Extract the desired fields. gilded_count is the post's own "gilded"
    # value (0 when absent), not a tally over the whole page: a per-page
    # tally would make each row depend on where the page boundary fell.
    return [
        {
            "title": post.get("title", ""),
            "selftext": post.get("selftext", ""),  # Some posts might not have selftext
            "score": post.get("score", ""),
            "num_comments": post.get("num_comments", ""),
            "gilded_count": post.get("gilded") or 0,
            "date": post.get("created_utc", "")
        }
        for post in data
    ]


In [4]:
def fetch_all_posts(subreddit, score, start_date, end_date, update_interval=10):
    """
    Fetch all matching submissions from PullPush.io by paging through the
    date range, one request of up to MAX_POSTS_PER_REQUEST posts at a time.

    Paging stops when a request returns no posts or fewer than
    MAX_POSTS_PER_REQUEST posts.

    :param subreddit: subreddit to scrape from
    :param score: minimum submission score to be selected
    :param start_date: submission must be after this date (Unix timestamp, seconds)
    :param end_date: submission must be before this date (Unix timestamp, seconds)
    :param update_interval: print a status update every this many requests;
        0 or None disables the periodic update
    :return: list of post dicts as returned by fetch_reddit_data
    """
    def _format_utc(timestamp):
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp.
        return datetime.fromtimestamp(timestamp, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    all_data = []
    next_fetch_timestamp = start_date
    request_count = 0  # Counter to keep track of the number of requests made

    while True:
        print(f"Fetching from {_format_utc(next_fetch_timestamp)} UTC...")  # Diagnostic print

        posts = fetch_reddit_data(subreddit, score, next_fetch_timestamp, end_date)

        if not posts:
            print("No posts returned.")  # Diagnostic print
            break

        all_data.extend(posts)
        request_count += 1

        # fetch_reddit_data stores "" when a post has no created_utc.
        last_post_date = posts[-1]['date']
        has_usable_date = isinstance(last_post_date, (int, float))

        # Guard against update_interval == 0 (would raise ZeroDivisionError).
        if update_interval and request_count % update_interval == 0:
            last_seen = _format_utc(last_post_date) if has_usable_date else "unknown"
            print(f"Made {request_count} requests so far. Last fetched post date: {last_seen} UTC")

        # A short page means the date range is exhausted.
        if len(posts) != MAX_POSTS_PER_REQUEST:
            break

        if not has_usable_date:
            # Cannot compute the next page start without a timestamp.
            print("Last post has no usable date; stopping pagination.")
            break

        # Resume one second after the last post of this page.
        # NOTE(review): depending on whether the API treats "after" as
        # inclusive or exclusive, the +1 may skip posts sharing a boundary
        # second - confirm against the PullPush documentation.
        next_fetch_timestamp = last_post_date + 1

        time.sleep(SLEEP_TIMER)  # Respectful delay

    return all_data

In [5]:
# Scrape parameters
subreddit = "twosentencehorror"
score_threshold = 20
start_date = 1_420_070_400  # Unix timestamp for January 1, 2015
end_date = int(time.time())  # Unix timestamp for "now"

# Run the scrape and load the collected records into a DataFrame.
all_data = fetch_all_posts(
    subreddit=subreddit,
    score=score_threshold,
    start_date=start_date,
    end_date=end_date,
)
df = pd.DataFrame(all_data)

Fetching from 2015-01-01 00:00:00 UTC...
Fetching from 2016-01-17 23:31:34 UTC...
Fetching from 2016-05-04 11:54:51 UTC...
Fetching from 2016-06-27 01:02:45 UTC...
Fetching from 2016-08-23 01:53:34 UTC...
Fetching from 2016-09-20 03:54:35 UTC...
Fetching from 2016-10-22 19:32:49 UTC...
Fetching from 2016-11-25 09:17:05 UTC...
Fetching from 2017-01-03 21:52:53 UTC...
Fetching from 2017-02-15 05:35:57 UTC...
Made 10 requests so far. Last fetched post date: 2017-02-15 05:35:57 UTC
Fetching from 2017-03-19 16:54:37 UTC...
Fetching from 2017-04-25 19:08:46 UTC...
Fetching from 2017-05-29 11:51:29 UTC...
Fetching from 2017-06-26 21:12:45 UTC...
Fetching from 2017-07-19 14:16:53 UTC...
Fetching from 2017-08-15 18:47:58 UTC...
Fetching from 2017-09-02 14:13:58 UTC...
Fetching from 2017-09-24 07:41:30 UTC...
Fetching from 2017-10-15 05:36:53 UTC...
Fetching from 2017-10-31 09:51:57 UTC...
Made 20 requests so far. Last fetched post date: 2017-10-31 09:51:57 UTC
Fetching from 2017-11-16 16:26:48 

In [15]:
# Parse the Unix-seconds 'date' column into datetimes, then store them as
# human-readable strings in a new 'timestamp' column.
parsed_dates = pd.to_datetime(df['date'], unit='s')
df['timestamp'] = parsed_dates.dt.strftime('%Y-%m-%d %H:%M:%S')

# Keep the raw epoch value as an integer column.
df['date'] = df['date'].astype(int)

In [16]:
# Display the frame; the rendered output shows only the first and last rows.
df

Unnamed: 0,title,selftext,score,num_comments,gilded_count,date,timestamp
0,Screaming down a well late at night is a dange...,,43,0,0,1423851552,2015-02-13 18:19:12
1,"""Do not expose any part of your body to the air.""","""I repeat..this is not a drill..""",65,5,0,1428090433,2015-04-03 19:47:13
2,"I sometimes remember the way he looked, broken...",I neglected to make sure he was dead.,22,0,0,1428234910,2015-04-05 11:55:10
3,I live alone on the third floor of my apartmen...,So who opens my window every night while I'm s...,35,3,0,1428369882,2015-04-07 01:24:42
4,"I heard the rain hitting my window, so I walke...","My window wasn't wet, but the glass was covere...",28,3,0,1428385255,2015-04-07 05:40:55
...,...,...,...,...,...,...,...
107495,As I look thru at window I see something inhumane,My reflection helps me remember how well my su...,31,2,0,1680376914,2023-04-01 19:21:54
107496,I’ve always been passionate about conspiracy t...,"So when my wife had twins, I knew exactly what...",27,8,0,1680377095,2023-04-01 19:24:55
107497,"""You'll see me on the red carpet one day,"" sai...","So I paid her a surprise visit, and upon walki...",23,2,0,1680377883,2023-04-01 19:38:03
107498,I could hear my sister screaming nearby as I s...,But my heart sank when I remembered the monste...,60,3,0,1680378061,2023-04-01 19:41:01


In [11]:
# Report the number of rows in the frame.
row_count = df.shape[0]
print(row_count)

107500


In [18]:
# Persist the scrape. index=False keeps the positional RangeIndex out of the
# file; otherwise it is written as an unnamed first column and comes back as
# "Unnamed: 0" when the CSV is reloaded.
df.to_csv('reddit_scrape_20_Jan2015_timestamp2.csv', index=False)
# df.to_parquet('reddit_scrape_20_Jan2015_timestamp.gzip', compression='gzip')
