In [77]:
import pandas as pd
import requests
import json
import time
import datetime as dt
from datetime import date, timedelta, timezone, datetime

# for repeating a specific item as needed during multithreaded requests
from itertools import repeat

# use for speeding up I/O operations ie. http requests
import concurrent.futures

In [78]:
# Shared accumulator: maps a day's start (a datetime key) to the list of
# post titles and comment bodies collected for that day.
# getData() and get_top_daily_comments() add to it in place, and they are
# run from a thread pool, so several workers may touch the same key.
date_with_posts = {}

In [79]:
def getData(start_end_time, sub, max_retries=5, retry_wait=10, timeout=30):
    """Fetch one day's top submission titles for a subreddit and record them.

    Asks the Pushshift submission-search endpoint for the 10 highest-scoring
    submissions in ``sub`` between the two timestamps, then appends their
    titles to the module-level ``date_with_posts`` dict under the key
    ``datetime.fromtimestamp(start_end_time[0])``.

    Args:
        start_end_time: Sequence whose first two items are the start and end
            of the window as epoch seconds.
        sub: Subreddit name to search.
        max_retries: Maximum number of request attempts before giving up.
        retry_wait: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        None. The titles are stored in ``date_with_posts`` as a side effect.

    Raises:
        RuntimeError: If no usable JSON response was obtained after
            ``max_retries`` attempts.
    """
    base_url = 'https://api.pushshift.io/reddit/search/submission/'
    # Let requests build and encode the query string.
    params = {
        'fields': 'title',
        'sort_type': 'score',
        'sort': 'desc',
        'size': 10,
        'after': start_end_time[0],
        'before': start_end_time[1],
        'subreddit': sub,
    }

    last_problem = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            if response.status_code == 200:
                result = response.json()
                break
            last_problem = f'HTTP {response.status_code}'
        except (requests.RequestException, ValueError) as exc:
            # Connection problems, timeouts, or a body that is not valid JSON.
            last_problem = repr(exc)
        # Wait before every retry so a rate-limited API is not hammered in a
        # tight loop; no point sleeping after the final attempt.
        if attempt < max_retries:
            time.sleep(retry_wait)
    else:
        raise RuntimeError(
            f'no usable response for r/{sub} after {max_retries} attempts '
            f'({last_problem})'
        )

    post_titles = [post['title'] for post in result['data']]

    date_key = datetime.fromtimestamp(start_end_time[0])

    # Several worker threads write to the same date key. A separate
    # "check, then assign" lets one thread overwrite another's list, so use a
    # single setdefault() call followed by extend() instead.
    date_with_posts.setdefault(date_key, []).extend(post_titles)

In [80]:
def get_top_daily_comments(*args, max_retries=5, retry_wait=10, timeout=30):
    """Fetch one day's comments matching a query from across all of reddit.

    Queries the Pushshift comment-search endpoint and appends the comment
    bodies to the module-level ``date_with_posts`` dict under the key
    ``datetime.fromtimestamp(after)``.

    Positional args (in order):
        query: Search string passed as ``q``.
        start_end_time: Sequence whose first two items are the ``after`` and
            ``before`` bounds as epoch seconds.
        num_comments: Maximum number of comments to request (``size``).
        sort_type: Field to sort by (``sort_type``).
        sort: Sort direction (``sort``).

    Keyword args:
        max_retries: Maximum number of request attempts before giving up.
        retry_wait: Seconds to wait between attempts.
        timeout: Per-request timeout in seconds, passed to ``requests.get``.

    Returns:
        None. The comment bodies are stored in ``date_with_posts`` as a side
        effect.

    Raises:
        RuntimeError: If no usable JSON response was obtained after
            ``max_retries`` attempts.
    """
    base_url = "https://api.pushshift.io/reddit/search/comment/"
    query = args[0]
    after = args[1][0]
    before = args[1][1]
    num_comments = args[2]
    sort_type = args[3]
    sort = args[4]

    # information for the data you want to get from the request
    payload = {
        'q': query,
        'after': after,
        'before': before,
        'size': num_comments,
        'sort_type': sort_type,
        'sort': sort
    }

    last_problem = None
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(base_url, params=payload, timeout=timeout)
            if response.status_code == 200:
                request_data = response.json()
                break
            last_problem = f'HTTP {response.status_code}'
        except (requests.RequestException, ValueError) as exc:
            # Connection problems, timeouts, or a body that is not valid JSON.
            last_problem = repr(exc)
        # Wait before every retry so a rate-limited API is not hammered in a
        # tight loop; no point sleeping after the final attempt.
        if attempt < max_retries:
            time.sleep(retry_wait)
    else:
        raise RuntimeError(
            f'no usable comment response for {query!r} after '
            f'{max_retries} attempts ({last_problem})'
        )

    # get the comments
    comments = [comment['body'] for comment in request_data['data']]

    date_key = datetime.fromtimestamp(after)

    # Several worker threads write to the same date key. A separate
    # "check, then assign" lets one thread overwrite another's list, so use a
    # single setdefault() call followed by extend() instead.
    date_with_posts.setdefault(date_key, []).extend(comments)

In [81]:
# Inclusive date window to collect data for.
start_date = date(2021, 1, 1)
end_date = date(2021, 7, 25)
difference = end_date - start_date
delta = timedelta(days=1)

# One entry per calendar day for the parallel requests, already in
# chronological order (January 1 first).
date_list = [start_date + offset * delta for offset in range(difference.days + 1)]

# Per-day [start, end] pairs as integer epoch seconds:
# position 0 is 00:00:00 of the day, position 1 is 23:59:59 of the same day.
day_bounds = (dt.time(0, 0), dt.time(23, 59, 59))
start_end_times = [
    [int(datetime.combine(day, bound).timestamp()) for bound in day_bounds]
    for day in date_list
]

In [82]:
# %%time
# # multithreaded requests for post titles on GME subreddit
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     executor.map(getData, start_end_times, repeat('GME'))

In [83]:
# %%time
# # multithreaded requests for post titles on GME subreddit
# with concurrent.futures.ThreadPoolExecutor() as executor:
#     executor.map(getData, start_end_times, repeat('superstonk'))

In [84]:
# %%time
# # multithreaded requests across reddit for comments about GME
# with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
#     executor.map(get_top_daily_comments,
#         repeat('GME'), # query reddit comments for substring 'GME'
#         start_end_times, # dates to check comments
#         repeat(20), # maximum number of comments to get for that day
#         repeat('score'), # get comments based on number of upvotes
#         repeat('desc') # order comments in descending order of upvotes
#     )

In [85]:
%%time
# Multithreaded requests for:
#   - post titles from r/GME and r/superstonk
#   - comments about GME from across reddit
with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
    # Submit in the same order as before: all r/GME days, then all
    # r/superstonk days, then the reddit-wide comment searches.
    futures = [
        executor.submit(getData, day_range, sub)
        for sub in ('GME', 'superstonk')
        for day_range in start_end_times
    ]
    futures += [
        executor.submit(
            get_top_daily_comments,
            'GME',      # query reddit comments for substring 'GME'
            day_range,  # dates to check comments
            20,         # maximum number of comments to get for that day
            'score',    # get comments based on number of upvotes
            'desc',     # order comments in descending order of upvotes
        )
        for day_range in start_end_times
    ]

    # executor.map() only re-raises a worker's exception when its result is
    # iterated, so unconsumed map() calls hide every failed request.
    # Inspect each future so failed days are reported instead of vanishing.
    for future in concurrent.futures.as_completed(futures):
        error = future.exception()
        if error is not None:
            print(f'request failed: {error!r}')

CPU times: user 20.8 s, sys: 1.44 s, total: 22.2 s
Wall time: 12min 24s


In [86]:
# Preview only the first few days in chronological order; dumping the whole
# dict writes hundreds of long strings into the notebook output.
{day: date_with_posts[day] for day in sorted(date_with_posts)[:3]}

{datetime.datetime(2021, 1, 1, 0, 0): ['This is good news for $gme.  Call it is!',
  'I laughed out loud\n\nEdit: GME 🚀💎🙌',
  '“ I love to be challenged, and I’m flexible on details, but I’m never willing to give up.” Ryan Cohen\n\nIm with Cohen and gme gang all the way!',
  "I'm coming to believe these mass awardings within GME posts and comments are not arbitrary, and there are actually people out there who are simply trying to do the right thing and desensitize us to being awarded/rewarded so when the day comes and we wake up to 420.69/a share,  we don't die of shock",
  "Imagine being on the Titanic and watching her slowly sink? Melvin Capital's GME position in 2021. The best part is when the ship splits in half and goes down all at once. Share recalls.",
  'I like this theory from u/redcedar53 -\n\nI think that 3% (buy on to 9.8% ownership) was just to test the waters to see how the market would react. It jumping to 25% upon news and holding it there for 2 weeks, that’s good enoug

In [87]:
# get the post/comment values from date_with_posts
# Per-day lists of collected post titles / comments.
check_nums_elements = date_with_posts.values()

# Number of collected items for each day.
check_nums_elements_len = list(map(len, check_nums_elements))

# How many days ended up with no post titles or comments at all.
count = check_nums_elements_len.count(0)
print(count)

17


# Export the data as a pickle file

In [88]:
import pickle

# Serialise the collected date -> texts mapping to disk using the newest
# pickle protocol this Python version supports.
with open('date_with_posts.pickle', 'wb') as pickle_file:
    pickler = pickle.Pickler(pickle_file, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(date_with_posts)

# instructions for importing the data

In [None]:
# https://stackoverflow.com/questions/11218477/how-can-i-use-pickle-to-save-a-dict
# import pickle

# with open('date_with_posts.pickle', 'rb') as handle:
#     data = pickle.load(handle)

# print(data)