In [1]:
import numpy as np
import requests
import os
import json
import csv
import time
import datetime

In [2]:
def get_pushshift_data(query, after, before, sub, timeout=30):
    """Fetch one page of submissions from the Pushshift submission-search endpoint.

    Args:
        query: Text sent as the ``title`` query parameter.
        after: Value sent as the ``after`` query parameter (the driver loop in
            this notebook passes epoch seconds).
        before: Value sent as the ``before`` query parameter (epoch seconds in
            this notebook).
        sub: Subreddit name sent as the ``subreddit`` query parameter.
        timeout: Seconds to wait for the server before giving up. Defaults to
            30 so a stalled connection cannot hang the whole scrape.

    Returns:
        The value stored under the ``'data'`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
        KeyError: If the decoded response has no ``'data'`` key.
    """
    # Let requests build the query string so that every value is URL-encoded;
    # plain concatenation breaks on queries containing spaces, '&', '#', etc.
    params = {
        'title': str(query),
        'size': 1000,
        'after': str(after),
        'before': str(before),
        'subreddit': str(sub),
    }
    r = requests.get('https://api.pushshift.io/reddit/search/submission/',
                     params=params, timeout=timeout)
    # Fail loudly on an error status instead of trying to parse an error page.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

In [3]:
def collect_sub_data(subm, stats=None):
    """Extract the fields of interest from one submission and store them by id.

    Args:
        subm: Mapping describing a single submission. The keys ``title``,
            ``url``, ``author``, ``id``, ``score``, ``created_utc``,
            ``num_comments`` and ``permalink`` are required;
            ``link_flair_text`` and ``selftext`` are optional.
        stats: Dict that receives the result. When omitted, the notebook-level
            ``sub_stats`` dict is used, as before.

    Returns:
        None. ``stats[<submission id>]`` is set to a one-element list holding
        the tuple ``(id, title, selftext, url, author, score, created,
        num_comments, permalink, flair)``.

    Raises:
        KeyError: If one of the required keys is missing from ``subm``.
    """
    # Resolve the destination first; the global is only looked up when the
    # caller did not hand in a dict of their own.
    target = sub_stats if stats is None else stats

    title = subm['title']
    url = subm['url']
    # Missing flair is recorded as the literal string 'NaN'.
    flair = subm.get('link_flair_text', 'NaN')
    author = subm['author']
    sub_id = subm['id']
    score = subm['score']

    # Missing selftext, or the '[removed]' / '[deleted]' placeholders, are
    # stored as an empty string. A plain .get() replaces the former bare
    # `except:`, which would also have hidden unrelated errors.
    selftext = subm.get('selftext', '')
    if selftext in ('[removed]', '[deleted]'):
        selftext = ''

    # fromtimestamp() without a tz argument yields a naive local-time datetime.
    created = datetime.datetime.fromtimestamp(subm['created_utc'])
    num_comments = subm['num_comments']
    permalink = subm['permalink']

    target[sub_id] = [(sub_id, title, selftext, url, author, score, created,
                       num_comments, permalink, flair)]

In [4]:
def write_subs_to_file(filename, stats=None):
    """Append the collected submissions to a CSV file.

    Args:
        filename: Path of the CSV file, opened in append mode as UTF-8.
        stats: Dict mapping a submission id to a list whose first element is
            the row to write. When omitted, the notebook-level ``sub_stats``
            dict is used, as before.

    Returns:
        None.

    The header row is written only when the file does not exist yet or is
    empty, so repeated calls keep appending rows under a single header.
    """
    rows_by_id = sub_stats if stats is None else stats

    # An existing zero-byte file still needs a header; checking only for
    # existence would leave such a file headerless forever.
    needs_header = (not os.path.exists(filename)
                    or os.path.getsize(filename) == 0)

    # newline='' lets the csv module control line endings itself.
    with open(filename, 'a', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        if needs_header:
            writer.writerow(['post_id', 'title', 'selftext', 'url', 'author',
                             'score', 'publish_date', 'num_of_comments',
                             'permalink', 'flair'])
        # Each dict value is a list; its first element is the row tuple.
        writer.writerows(entries[0] for entries in rows_by_id.values())

In [6]:
if __name__ == '__main__':
    # Scrape settings: subreddit, title keyword, output file and date range.
    sub_reddit = 'Binance'
    key_word = 'Binance'
    output_filename = 'Reddit_data_Binance__.csv'
    start_date = datetime.datetime(2022, 1, 1, 0)
    end_date = datetime.datetime(2022, 3, 31, 0)
    one_day = datetime.timedelta(hours=24)

    def _epoch_str(moment):
        """Render a datetime as a whole-second epoch string."""
        return str(int(moment.timestamp()))

    # The range is walked in 24-hour windows [after_date, before_date).
    after_date, before_date = start_date, start_date + one_day
    after, before = _epoch_str(after_date), _epoch_str(before_date)

    while after_date < end_date:
        # `sub_stats` is read as a global by collect_sub_data() and
        # write_subs_to_file(); it is reset for every window.
        sub_count = 0
        sub_stats = {}
        max_count = 100  # upper bound on pages fetched per window
        count = 0
        data = get_pushshift_data(key_word, after, before, sub_reddit)
        while data and count < max_count:
            for submission in data:
                collect_sub_data(submission)
            sub_count += len(data)
            # Page forward: the next request starts at the creation time of
            # the last submission on this page.
            newest = data[-1]['created_utc']
            print(datetime.datetime.fromtimestamp(newest))
            after = newest
            data = get_pushshift_data(key_word, after, before, sub_reddit)
            count += 1
        write_subs_to_file(output_filename)

        # Slide the window forward by one day.
        after_date, before_date = after_date + one_day, before_date + one_day
        after, before = _epoch_str(after_date), _epoch_str(before_date)
        # Pause 1 or 2 seconds between windows (randint's upper bound is exclusive).
        time.sleep(np.random.randint(1, 3))

2022-01-01 22:27:32
2022-01-02 23:13:02
2022-01-03 23:15:07
2022-01-04 23:46:15
2022-01-05 23:46:59
2022-01-06 23:46:39
2022-01-07 22:06:57
2022-01-08 23:59:25
2022-01-09 23:52:10
2022-01-10 23:22:44
2022-01-11 22:36:37
2022-01-12 23:24:16
2022-01-13 23:50:38
2022-01-14 23:39:00
2022-01-15 22:57:51
2022-01-16 22:59:10
2022-01-17 23:50:59
2022-01-18 23:45:27
2022-01-19 22:46:11
2022-01-20 23:56:00
2022-01-21 23:42:19
2022-01-22 23:36:52
2022-01-23 23:42:33
2022-01-24 23:32:51
2022-01-25 23:01:14
2022-01-26 23:13:06
2022-01-27 23:39:57
2022-01-28 23:09:13
2022-01-29 23:46:34
2022-01-30 23:47:34
2022-01-31 23:37:57
2022-02-01 23:03:01
2022-02-02 23:11:49
2022-02-03 23:18:27
2022-02-04 23:50:38
2022-02-05 23:04:09
2022-02-06 23:41:20
2022-02-07 23:48:16
2022-02-08 23:41:05
2022-02-09 23:43:59
2022-02-10 23:27:24
2022-02-11 23:42:42
2022-02-12 23:43:00
2022-02-13 23:29:29
2022-02-14 23:19:20
2022-02-15 23:29:08
2022-02-16 23:14:11
2022-02-17 23:51:27
2022-02-18 20:56:16
2022-02-19 23:24:23
