In [1]:
import numpy as np
import requests
import json
import csv
import time
import datetime
import os

In [2]:
def get_pushshift_data(query, after, before, sub):
    """Fetch one page of submissions from the Pushshift search endpoint.

    Args:
        query: Text matched against submission titles (sent as ``title``).
        after: Lower bound of the search window (sent as ``after``).
        before: Upper bound of the search window (sent as ``before``).
        sub: Subreddit name the search is restricted to (sent as ``subreddit``).

    Returns:
        Whatever the JSON response stores under its ``'data'`` key.

    Raises:
        requests.RequestException: On connection problems, timeouts, or an
            HTTP error status.
        KeyError: If the decoded response has no ``'data'`` key.
    """
    # Function-scope import: the notebook's import cell does not load urllib.
    from urllib.parse import urlencode

    # urlencode escapes spaces, '&', '#', ... in the arguments; plain string
    # concatenation would let such characters corrupt the query string.
    params = urlencode({
        'title': query,
        'size': 1000,
        'after': after,
        'before': before,
        'subreddit': sub,
    })
    url = 'https://api.pushshift.io/reddit/search/submission/?' + params
    print(url)
    # Without a timeout a stalled connection would block the download forever.
    r = requests.get(url, timeout=30)
    # Surface HTTP errors (rate limiting, 5xx) as such instead of letting them
    # show up later as a confusing JSON decode failure.
    r.raise_for_status()
    data = json.loads(r.text)
    return data['data']

In [3]:
def collect_sub_data(subm):
    """Flatten one submission record and store it in the global ``sub_stats``.

    The record is reduced to a 10-tuple in CSV column order
    ``(id, title, selftext, url, author, score, created, num_comments,
    permalink, flair)`` and saved as a one-element list under
    ``sub_stats[id]``. ``sub_stats`` must already exist as a module-level
    mapping when this function is called.

    Args:
        subm: Mapping describing a single submission. The keys ``title``,
            ``url``, ``author``, ``id``, ``score``, ``created_utc``,
            ``num_comments`` and ``permalink`` are required;
            ``link_flair_text`` and ``selftext`` are optional.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    # Optional fields: fall back to a default only when the key is absent.
    # (The previous bare ``except:`` also hid unrelated errors.)
    flair = subm.get('link_flair_text', 'NaN')
    selftext = subm.get('selftext', '')
    # '[removed]' / '[deleted]' are placeholders, not real post bodies.
    if selftext in ('[removed]', '[deleted]'):
        selftext = ''

    sub_id = subm['id']
    # NOTE(review): fromtimestamp() without a tz renders the epoch value in
    # the machine's local time zone, so 'created' is local time even though
    # the source field is named created_utc.
    created = datetime.datetime.fromtimestamp(subm['created_utc'])  # e.g. 1520561700.0

    row = (
        sub_id,
        subm['title'],
        selftext,
        subm['url'],
        subm['author'],
        subm['score'],
        created,
        subm['num_comments'],
        subm['permalink'],
        flair,
    )
    # Stored as a single-element list; the CSV writer reads element [0].
    sub_stats[sub_id] = [row]

In [4]:
def write_subs_to_file(filename):
    """Append every row held in the global ``sub_stats`` to a CSV file.

    The file is opened in append mode, so repeated calls accumulate rows.
    The header row is written only when the file does not exist yet or is
    still empty.

    Args:
        filename: Path of the CSV file to append to.

    Returns:
        The number of submission rows written by this call.
    """
    headers = ['post_id', 'title', 'selftext', 'url', 'author', 'score', 'publish_date', 'num_of_comments',
               'permalink', 'flair']
    # An existing but empty file (e.g. left by an interrupted run) still needs
    # its header, so check the size as well as the existence.
    keep_header = not os.path.exists(filename) or os.path.getsize(filename) == 0

    upload_count = 0
    with open(filename, 'a', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter=',')
        if keep_header:
            writer.writerow(headers)
        for rows in sub_stats.values():
            # Each value is a one-element list holding the row tuple.
            writer.writerow(rows[0])
            upload_count += 1
    return upload_count

In [7]:
if __name__ == '__main__':
    # Download Reddit posts from `sub_reddit` whose titles match `key_word`,
    # one day at a time, appending each day's posts to `output_filename`.

    sub_reddit = 'Bitcoin'
    key_word = 'Bitcoin'

    output_filename = 'reddit_data_Bitcoin.csv'
    # Overall search range. These are naive datetimes, so .timestamp() below
    # interprets them in the machine's local time zone.
    start_date = datetime.datetime(2022, 1, 1, 0)
    end_date = datetime.datetime(2022, 3, 31, 0)

    # In each iteration fetch the posts of a single day, to avoid getting
    # blocked by the server.
    one_day = datetime.timedelta(hours=24)
    # Upper bound on the number of pages requested per day.
    max_count = 100

    after_date = start_date
    while after_date < end_date:
        # Clamp the window so a range that is not a whole number of days never
        # reaches past end_date.
        before_date = min(after_date + one_day, end_date)
        after = str(int(after_date.timestamp()))
        before = str(int(before_date.timestamp()))

        print('-' * 80)
        print(after_date, ' -> ', before_date)
        print('-' * 80)

        sub_count = 0  # submissions seen in this window
        # Must stay a module-level name: collect_sub_data() and
        # write_subs_to_file() access it as a global.
        sub_stats = {}

        data = get_pushshift_data(key_word, after, before, sub_reddit)

        count = 0
        while len(data) > 0 and count < max_count:
            print('count ', count)
            for submission in data:
                collect_sub_data(submission)
                sub_count += 1

            print(len(data))
            print(str(datetime.datetime.fromtimestamp(data[-1]['created_utc'])))
            # Continue from the creation time of the last post on this page.
            after = data[-1]['created_utc']
            data = get_pushshift_data(key_word, after, before, sub_reddit)
            count = count + 1

        if len(data) > 0:
            # The loop stopped on the page cap, not on an empty page.
            print('page limit of', max_count, 'reached; posts for this window may be incomplete')

        # Save what was collected in this iteration before moving on.
        write_subs_to_file(output_filename)

        # Move to the next day.
        after_date = before_date

        # Sleep a random 1-2 seconds before starting the next iteration.
        time.sleep(np.random.randint(1, 3))

--------------------------------------------------------------------------------
2022-01-01 00:00:00  ->  2022-01-02 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1640975400&before=1641061800&subreddit=Bitcoin
count  0
55
2022-01-01 23:33:09
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1641060189&before=1641061800&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-01-02 00:00:00  ->  2022-01-03 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1641061800&before=1641148200&subreddit=Bitcoin
count  0
74
2022-01-02 23:57:39
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1641148059&before=1641148200&subreddit=Bitcoin
----------

count  0
80
2022-01-15 23:57:46
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1642271266&before=1642271400&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-01-16 00:00:00  ->  2022-01-17 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1642271400&before=1642357800&subreddit=Bitcoin
count  0
67
2022-01-16 23:44:11
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1642356851&before=1642357800&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-01-17 00:00:00  ->  2022-01-18 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1642357800&before=1642444200&subreddit=Bitcoin
count  0
6

--------------------------------------------------------------------------------
2022-01-30 00:00:00  ->  2022-01-31 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1643481000&before=1643567400&subreddit=Bitcoin
count  0
89
2022-01-30 23:50:51
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1643566851&before=1643567400&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-01-31 00:00:00  ->  2022-02-01 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1643567400&before=1643653800&subreddit=Bitcoin
count  0
74
2022-01-31 23:51:37
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1643653297&before=1643653800&subreddit=Bitcoin
----------

count  0
81
2022-02-14 23:45:07
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1644862507&before=1644863400&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-02-15 00:00:00  ->  2022-02-16 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1644863400&before=1644949800&subreddit=Bitcoin
count  0
92
2022-02-15 23:50:19
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1644949219&before=1644949800&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-02-16 00:00:00  ->  2022-02-17 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1644949800&before=1645036200&subreddit=Bitcoin
count  0
8

count  0
100
2022-03-02 23:25:33
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1646243733&before=1646245800&subreddit=Bitcoin
count  1
1
2022-03-02 23:42:32
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1646244752&before=1646245800&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-03-03 00:00:00  ->  2022-03-04 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1646245800&before=1646332200&subreddit=Bitcoin
count  0
79
2022-03-03 23:31:45
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1646330505&before=1646332200&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-03-04 00:00:00  ->  2022-03-05 00:00:00
-----------------------------------------------------------

count  0
70
2022-03-18 23:43:30
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1647627210&before=1647628200&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-03-19 00:00:00  ->  2022-03-20 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1647628200&before=1647714600&subreddit=Bitcoin
count  0
56
2022-03-19 23:49:27
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1647713967&before=1647714600&subreddit=Bitcoin
--------------------------------------------------------------------------------
2022-03-20 00:00:00  ->  2022-03-21 00:00:00
--------------------------------------------------------------------------------
https://api.pushshift.io/reddit/search/submission/?title=Bitcoin&size=1000&after=1647714600&before=1647801000&subreddit=Bitcoin
count  0
3