In [54]:
import requests
import json
import time
import datetime

In [53]:
def get_epoch(month, day, year):
    date = datetime.date(month=month, day=day, year=year)
    return int(time.mktime(date.timetuple()))

In [52]:
def post_pushshift_query(sub, query, after, before):
    ps = 'https://api.pushshift.io/reddit/submission/search/?title={query}&after={after}&before={before}&subreddit={sub}'
    r = requests.get(ps.format(sub=sub, query=query, after=after, before=before)) 
    return json.loads(r.text)

In [55]:
g = post_pushshift_query('wallstreetbets', 'TSLA', get_epoch(7, 2, 2020), get_epoch(7, 3, 2020))

In [122]:
def comment_pushshift_query(sub, query, after, before):
    all_comments = []
    last_comment_time = after
    while last_comment_time < before:
        if query is not None:
            ps = 'https://api.pushshift.io/reddit/comment/search/?q={query}&after={after}&before={before}&subreddit={sub}'
        else:
            ps = 'https://api.pushshift.io/reddit/comment/search/?after={after}&before={before}&subreddit={sub}&size=100'
        r = requests.get(ps.format(sub=sub, query=query, after=last_comment_time, before=before)) 
        try:
            d = json.loads(r.text)['data']
            all_comments += d
            last_comment_time = d[-1]['retrieved_on']
        except Exception as e:
            print(e)
            return all_comments
        print(last_comment_time)
    return all_comments

In [125]:
g = comment_pushshift_query('wallstreetbets', 'TSLA', get_epoch(8, 3, 2020), get_epoch(8, 4, 2020))

1596435681
1596451571
1596460572
1596465019
1596467674
1596471649
1596482009
1596496797
1596513317
list index out of range


In [127]:
len(g)

225

- Can we assume that we don't care about the parents of query-containing comments?
- We need an efficient way to collect children comments if their parent mentioned a certain ticker.
- Can probably do a multiprocessing pool for determining what stocks we want to keep track of on HAL, requests are not quick so this will have to be a multiple day process.
- Collecting the comments without redundancy becomes a bit of a graph problem
- https://github.com/dmarx/psaw use this API instead

In [130]:
!pip install psaw

Defaulting to user installation because normal site-packages is not writeable
Collecting psaw
  Downloading psaw-0.0.12-py3-none-any.whl (15 kB)
Installing collected packages: psaw
Successfully installed psaw-0.0.12


In [131]:
from psaw import PushshiftAPI

api = PushshiftAPI()
gen = api.search_comments(q='TSLA', subreddit='wallstreetbets')
max_response_cache = 10000
cache = []
for c in gen:
    cache.append(c)
    if len(cache) >= max_response_cache:
        break


KeyboardInterrupt: 

- Should probably pick our top tickers beforehand, API can be slow to deal with

In [133]:
len(cache)

7700

In [136]:
cache[0].body

"Nah, slow bleed for next couple weeks, will perk up for Battery Day, then continue to bleed down. I don't think they'll drop below $1000 unless their Q3 numbers are completely shit, but I think the TSLA roller-coaster is going to flatten out and become less volatile.\n\nUnless Musk gets caught snorting coke off an underage hooker's ass, then TSLA to the moon."

In [150]:
from collections import defaultdict
mention_dict = defaultdict(int)

def analyze_comment(comment, mention_dict):
    for word in comment.split(' '):
        if (word.isupper() and len(word) < 6 and len(word) > 1):
            mention_dict[word] += 1

In [151]:
%%time
for comment in cache:
    analyze_comment(comment.body, mention_dict)

CPU times: user 114 ms, sys: 6.72 ms, total: 121 ms
Wall time: 118 ms


In [152]:
mention_dict

defaultdict(int,
            {'Q3': 6,
             'TSLA': 5366,
             '$TSLA': 148,
             '0DTE': 44,
             'AMZN': 219,
             'KODK': 32,
             'EOW.': 10,
             'OTM': 64,
             'BABA': 8,
             'TSLA!': 22,
             'NKLA': 100,
             '$GM': 1,
             'TSLA?': 99,
             'AAPL': 125,
             'AMD': 132,
             'SQ': 16,
             'SLV': 63,
             'EOD': 48,
             'USD,': 1,
             'USD': 6,
             'K-': 1,
             'VALUE': 1,
             'AMZN.': 14,
             'THE': 26,
             'NORTH': 1,
             'GLD': 13,
             'SLV.': 6,
             '+4K': 1,
             'LOW': 1,
             'HD': 1,
             'WMT': 17,
             'PTON': 4,
             'MU': 1,
             'NFLX': 17,
             'RTX': 1,
             'TSLA,': 267,
             'EO': 1,
             'US': 18,
             'SPY': 108,
             'SLV,': 22,
          