In [1]:
import datetime
import requests
import json
import re
import time

# Base URL of the Pushshift Reddit search API; fetchObjects appends "/<mode>/" to it.
url = "https://api.pushshift.io/reddit/search"

In [None]:
def fetchObjects(**kwargs):
    """Fetch one page of results from the Pushshift search API.

    Keyword Args:
        mode: Required. Endpoint segment appended to the module-level ``url``
            (the notebook calls this with ``'submission'``). It is removed from
            the keyword arguments because it is part of the path, not a query
            parameter.
        **kwargs: Every remaining keyword is sent as a query-string parameter
            and overrides the defaults below (sort ascending by
            ``created_utc``, 1000 items per page).

    Returns:
        The list found under the response's ``"data"`` key, sorted ascending
        by each item's base-36 ``id``.

    Raises:
        KeyError: If ``mode`` is not supplied, or the response has no
            ``"data"`` key (or an item has no ``"id"``).

    The request is retried indefinitely on non-200 responses, timeouts and
    connection errors, pausing between attempts with a capped exponential
    backoff so that a struggling server is not hammered.
    """
    # Default parameters
    # Change as necessary/desired
    params = {
        # Pushshift's parameter is "sort_type"; the former "sorted_type" key
        # was not a recognised parameter.
        "sort_type": "created_utc",
        "sort": "asc",
        "size": "1000"
        }
    # mode selects the endpoint and is not a valid url parameter
    mode = kwargs.pop('mode')

    # Caller-supplied parameters take precedence over the defaults
    params.update(kwargs)

    retry_delay = 1  # seconds; doubled after every failed attempt, capped at 60
    while True:
        try:
            r = requests.get(f'{url}/{mode}/', params=params, timeout=90)
        except (requests.exceptions.Timeout,
                requests.exceptions.ConnectionError) as error:
            # Transient network failure: report it and retry like a bad status
            print(error)
        else:
            if r.status_code == 200:
                break
            print(r.status_code)
        print("Retrying...")
        time.sleep(retry_delay)
        retry_delay = min(retry_delay * 2, 60)

    response = json.loads(r.text)
    data = response['data']
    # ids are base-36 strings; compare them numerically
    sorted_data_by_id = sorted(data, key=lambda x: int(x['id'], 36))
    return sorted_data_by_id

In [None]:
def extract_reddit_data(**kwargs):
    """Crawl Pushshift page by page and append every object to a file.

    Keyword Args:
        filepath: Required. Path of the output file, opened in append mode.
            One JSON object is written per line.
        after: Optional lower time bound. Either a ``'%d-%b-%Y'`` date string
            (e.g. ``'20-JAN-2020'``, interpreted in local time) or a numeric
            epoch timestamp. Defaults to 0 when omitted.
        before: Optional upper time bound, in the same formats as ``after``.
        **kwargs: Everything else (including ``mode``) is forwarded unchanged
            to ``fetchObjects`` on every request.

    Returns:
        None. The crawl stops as soon as a page contains no object with an id
        greater than the largest id already written.

    Raises:
        KeyError: If ``filepath`` is not supplied.
        ValueError: If a string bound does not match ``'%d-%b-%Y'``.
    """
    # Date strings are converted to epoch seconds; numeric bounds (such as a
    # created_utc taken from a previous crawl) are passed through untouched.
    for bound in ('after', 'before'):
        if isinstance(kwargs.get(bound), str):
            parsed = datetime.datetime.strptime(kwargs[bound], '%d-%b-%Y')
            kwargs[bound] = int(parsed.timestamp())

    if 'after' in kwargs:
        print(f"Starting crawl from {datetime.datetime.fromtimestamp(kwargs['after'])}")
    else:
        kwargs['after'] = 0

    max_id = 0

    # filepath is not an API parameter, so remove it before forwarding kwargs
    filepath = kwargs.pop('filepath')

    # The context manager guarantees the file is closed on return or error
    with open(filepath, "a") as file:
        while True:
            nothing_processed = True
            objects = fetchObjects(**kwargs)
            print(f"Retrieving data from {datetime.datetime.fromtimestamp(kwargs['after'])}")

            for item in objects:
                item_id = int(item['id'], 36)
                # Skip anything already written by an earlier, overlapping page
                if item_id > max_id:
                    nothing_processed = False
                    created_utc = item['created_utc']
                    max_id = item_id
                    if created_utc > kwargs['after']:
                        kwargs['after'] = created_utc
                    # Output JSON data to the opened file
                    file.write(json.dumps(item, sort_keys=True, ensure_ascii=True) + "\n")

            # Exit if nothing happened
            if nothing_processed:
                return

            # Step back one second so objects sharing the last timestamp are
            # requested again; the max_id check above filters the repeats.
            kwargs['after'] -= 1

            # Sleep a little before the next function call
            time.sleep(.5)

In [None]:
# Crawl r/coronavirus submissions in the given date window and append them,
# one JSON object per line, to submissions_dataset.json.
extract_reddit_data(
    subreddit='coronavirus',
    mode='submission',
    after='20-JAN-2020',
    before='01-FEB-2021',
    filepath='submissions_dataset.json',
)

In [None]:
'''
Function to resume crawl from the last row of data.
Ensure that the file path is correct.
'''

# def resumeFromLast(**kwargs):
#     filepath = kwargs['filepath']
#     data = [json.loads(line) for line in open(filepath, 'r')]
#     last_submission_date = data[-1]['created_utc']
#     extract_reddit_data(after=last_submission_date, **kwargs)

In [None]:
'''
Run method with the desired set of parameters
'''
# resumeFromLast(subreddit='coronavirus', mode='comment', before=1609516800, filepath='comments_dataset.json')