In [1]:
# Installing dependencies
#!pip install praw
#!pip install psaw
#!pip install yfinance

In [2]:
import praw
from psaw import PushshiftAPI
import json
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta

# Suppress warning messages
import warnings
#warnings.filterwarnings('ignore')

In [3]:
# Load client_id, secret_id, and user_agent
with open('info.json') as f:
     info = json.load(f)
        
info = dict(info)

In [4]:
# Initialize Reddit and PushshiftAPI instances
reddit = praw.Reddit(client_id=info["client_id"], user_agent=info["user_agent"], client_secret=info["client_secret"])
api = PushshiftAPI(reddit)

In [5]:
# Store results of a search in a DataFrame
subm_dicts = [{k:getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in api.search_submissions(subreddit='stocks', q="TWTR", filter=['url','author', 'title', 'subreddit'], limit=100)]
df = pd.DataFrame(subm_dicts)
df



Unnamed: 0,comment_limit,comment_sort,_reddit,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,...,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,_fetched,_comments_by_id,link_flair_template_id
0,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,#Good morning traders and investors of the r/s...,t2_eaak0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/txlkg...,3856752,1.649250e+09,0,,False,False,{},
1,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_4yrm7th0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/txef6...,3856752,1.649221e+09,0,,False,False,{},866eb162-65e6-11e5-a903-1252b640afe9
2,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_3u5upg0j,False,,0,...,False,https://www.reddit.com/r/stocks/comments/twykl...,3856752,1.649175e+09,0,,False,False,{},5a4c814a-65e6-11e5-b65e-122ab0778f8b
3,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,#Good morning traders and investors of the r/s...,t2_eaak0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/twts1...,3856752,1.649162e+09,0,,False,False,{},
4,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_153n7s,False,,0,...,False,https://www.reddit.com/r/stocks/comments/tw47e...,3856752,1.649085e+09,0,,False,False,{},5a4c814a-65e6-11e5-b65e-122ab0778f8b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_2me1cihr,False,,0,...,False,https://www.reddit.com/r/stocks/comments/n1w56...,3856752,1.619796e+09,0,,False,False,{},
96,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_46aro24r,False,,0,...,False,https://www.reddit.com/r/stocks/comments/n0tdi...,3856752,1.619658e+09,0,,False,False,{},
97,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,"**PsychoMarket Recap - Monday, April 26, 2021*...",t2_7gtjd4c0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/mz8dh...,3856752,1.619472e+09,1,,False,False,{},
98,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,"**PsychoMarket Recap - Thursday, April 22, 202...",t2_7gtjd4c0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/mwegi...,3856752,1.619125e+09,1,,False,False,{},


In [6]:
# Get historical stock data for a ticker
twtr = yf.download('TWTR', progress=True)
twtr

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-11-07,45.099998,50.090000,44.000000,44.900002,44.900002,117701600
2013-11-08,45.930000,46.939999,40.689999,41.650002,41.650002,27925300
2013-11-11,40.500000,43.000000,39.400002,42.900002,42.900002,16113900
2013-11-12,43.660000,43.779999,41.830002,41.900002,41.900002,6316700
2013-11-13,41.029999,42.869999,40.759998,42.599998,42.599998,8688300
...,...,...,...,...,...,...
2022-03-31,39.110001,39.230000,38.410000,38.689999,38.689999,13208300
2022-04-01,39.160000,39.849998,39.000000,39.310001,39.310001,12122600
2022-04-04,47.869999,51.369999,46.860001,49.970001,49.970001,268465400
2022-04-05,53.849998,54.570000,50.560001,50.980000,50.980000,217520100


## Processing Tickers

In [7]:
# At close, calculate the real and percent change since last close
def get_diff(ticker_data):
    df = ticker_data.copy()
    real = []
    percent = []
    for index, row in df.reset_index().iterrows():
        if(index == 0):
            real.append(0)
            percent.append(0)
        else:
            real.append(row["Close"]-df.iloc[index-1]["Close"])
            percent.append(real[-1]/df.iloc[index-1]["Close"])
    return real, percent

In [8]:
# Get the reddit posts that mention a certain ticker n days before a large change in stock price
def get_pre_change_posts(ticker, ticker_gain, days=1, limit=100, subreddit="stocks"):
    df = None
    for index, row in ticker_gain.iterrows():
        start_date = datetime.fromtimestamp(row.name.timestamp()) + timedelta(hours=6, days=-days)
        end_date = datetime.fromtimestamp(row.name.timestamp()) + timedelta(hours=6)
        
        # TODO: Check whether comments would be better than submissions
        
        submissions = api.search_comments(after=start_date, before=end_date, q=ticker, subreddit=subreddit, filter=['url','author', 'title', 'subreddit'], limit=limit)
        if(df is None):
            df = pd.DataFrame([{k:getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in submissions])
        else:
            df = df.append([{k:getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in submissions], ignore_index=True)
    return df

In [9]:
# Generate information for a given ticker
def process_ticker(ticker, gain_cutoff=0.05, loss_cutoff=0.05, limit=100, days=1):
    ticker_data = yf.download(ticker, progress=False)
    real, percent = get_diff(twtr)
    ticker_data["Real_Change"] = real
    ticker_data["Percent_Change"] = percent
    ticker_gain = ticker_data[ticker_data["Percent_Change"] > gain_cutoff]
    ticker_loss = ticker_data[ticker_data["Percent_Change"] < -loss_cutoff]
    
    pre_gain = get_pre_change_posts(ticker, ticker_gain, days, limit)
    pre_loss = get_pre_change_posts(ticker, ticker_loss, days, limit)
    return ticker_gain, ticker_loss, pre_gain, pre_loss

In [10]:
gain, loss, pre_gain, pre_loss = process_ticker("TWTR", limit=100)



In [11]:
gain

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Real_Change,Percent_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-12-04,41.270000,43.919998,41.270000,43.689999,43.689999,11014900,2.320000,0.056079
2013-12-09,45.590000,49.840000,45.020000,49.139999,49.139999,17366600,4.189999,0.093215
2013-12-10,48.900002,52.580002,48.700001,51.990002,51.990002,25792000,2.850002,0.057998
2013-12-12,52.200001,55.869999,50.689999,55.330002,55.330002,23446900,2.990002,0.057127
2013-12-13,56.200001,59.410000,55.450001,59.000000,59.000000,38979600,3.669998,0.066329
...,...,...,...,...,...,...,...,...
2022-02-09,36.500000,37.919998,36.139999,37.830002,37.830002,24473500,1.850002,0.051418
2022-02-24,31.299999,35.070000,31.299999,34.980000,34.980000,22551000,2.220001,0.067766
2022-03-09,33.549999,34.529999,33.299999,34.369999,34.369999,16886000,1.689999,0.051714
2022-03-17,35.209999,37.709999,34.889999,37.299999,37.299999,30853100,1.930000,0.054566


In [12]:
loss

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Real_Change,Percent_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-11-08,45.930000,46.939999,40.689999,41.650002,41.650002,27925300,-3.250000,-0.072383
2013-11-18,43.500000,43.950001,40.849998,41.139999,41.139999,12810600,-2.840000,-0.064575
2013-12-27,70.099998,71.250000,63.689999,63.750000,63.750000,60418700,-9.559998,-0.130405
2013-12-30,60.270000,63.709999,58.570000,60.509998,60.509998,55538300,-3.240002,-0.050824
2014-01-07,67.669998,67.730003,61.389999,61.459999,61.459999,31748400,-4.830002,-0.072862
...,...,...,...,...,...,...,...,...
2021-04-30,56.000000,57.630001,55.049999,55.220001,55.220001,88378800,-9.869995,-0.151636
2021-10-04,61.040001,61.160000,57.639999,58.389999,58.389999,17381300,-3.590000,-0.057922
2021-10-27,60.049999,60.160000,54.790001,54.810001,54.810001,48107700,-6.619999,-0.107765
2022-01-21,36.900002,37.080002,34.799999,34.820000,34.820000,25674100,-2.459999,-0.065987


In [13]:
pre_gain

Unnamed: 0,_replies,_submission,_reddit,subreddit_id,approved_at_utc,author_is_blocked,comment_type,edited,mod_reason_by,banned_by,...,treatment_tags,created_utc,subreddit_name_prefixed,controversiality,author_flair_background_color,collapsed_because_crowd_control,mod_reports,mod_note,distinguished,_fetched
0,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1386791000.0,r/stocks,0,,,[],,,True
1,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1387820000.0,r/stocks,0,,,[],,,True
2,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1387819000.0,r/stocks,0,,,[],,,True
3,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1387784000.0,r/stocks,0,,,[],,,True
4,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1388418000.0,r/stocks,0,,,[],,,True
5,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1391725000.0,r/stocks,0,,,[],,,True
6,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1391662000.0,r/stocks,0,,,[],,,True
7,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1406673000.0,r/stocks,0,,,[],,,True
8,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1406671000.0,r/stocks,0,,,[],,,True
9,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,False,,,...,[],1406668000.0,r/stocks,0,,,[],,,True


In [14]:
pre_loss

Unnamed: 0,_replies,_submission,_reddit,subreddit_id,approved_at_utc,author_is_blocked,comment_type,edited,mod_reason_by,banned_by,...,treatment_tags,created_utc,subreddit_name_prefixed,controversiality,author_flair_background_color,collapsed_because_crowd_control,mod_reports,mod_note,distinguished,_fetched
0,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383867000.0,r/stocks,0,,,[],,,True
1,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383858000.0,r/stocks,0,,,[],,,True
2,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383855000.0,r/stocks,0,,,[],,,True
3,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383851000.0,r/stocks,0,,,[],,,True
4,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383851000.0,r/stocks,0,,,[],,,True
5,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383849000.0,r/stocks,0,,,[],,,True
6,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383843000.0,r/stocks,0,,,[],,,True
7,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383840000.0,r/stocks,0,,,[],,,True
8,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1383840000.0,r/stocks,0,,,[],,,True
9,[],,<praw.reddit.Reddit object at 0x00000231DB56E070>,t5_2qjfk,,False,,0,,,...,[],1391641000.0,r/stocks,0,,,[],,,True


In [15]:
list(pre_gain.columns)

['_replies',
 '_submission',
 '_reddit',
 'subreddit_id',
 'approved_at_utc',
 'author_is_blocked',
 'comment_type',
 'edited',
 'mod_reason_by',
 'banned_by',
 'ups',
 'num_reports',
 'author_flair_type',
 'total_awards_received',
 'subreddit',
 'author_flair_template_id',
 'likes',
 'user_reports',
 'saved',
 'id',
 'banned_at_utc',
 'mod_reason_title',
 'gilded',
 'archived',
 'collapsed_reason_code',
 'no_follow',
 'author',
 'can_mod_post',
 'send_replies',
 'parent_id',
 'score',
 'author_fullname',
 'report_reasons',
 'removal_reason',
 'approved_by',
 'all_awardings',
 'body',
 'awarders',
 'top_awarded_type',
 'downs',
 'author_flair_css_class',
 'author_patreon_flair',
 'collapsed',
 'author_flair_richtext',
 'is_submitter',
 'body_html',
 'gildings',
 'collapsed_reason',
 'associated_award',
 'stickied',
 'author_premium',
 'can_gild',
 'link_id',
 'unrepliable_reason',
 'author_flair_text_color',
 'score_hidden',
 'permalink',
 'subreddit_type',
 'locked',
 'name',
 'create

In [60]:
def word_counts(df, column="body", min_letters=3):
    counts = {}
    for i in list(df[column]):
        for j in i.split(" "):
            j = ''.join(k for k in j if k.isalnum())
            # Exclude words that are likely tickers
            if(j == j.upper() and len(j) > 1 and len(j) <= 5):
                pass
            elif(len(j) < 3):
                pass
            elif(j not in counts.keys()):
                counts[j.lower()] = 1
            else:
                counts.update({j.lower():counts.get(j.lower())+1})
    return counts

In [66]:
def remove_shared_keys(dict_a, dict_b):
    a = dict_a.copy()
    b = dict_b.copy()
    
    to_remove = []
    for i in a:
        if(i in b):
            to_remove.append(i)
    for i in to_remove:
        a.pop(i)
        b.pop(i)
    return a, b

In [67]:
gain_wc = dict(sorted(word_counts(pre_gain).items(), key=lambda x: x[1], reverse=True))

In [68]:
loss_wc = dict(sorted(word_counts(pre_loss).items(), key=lambda x: x[1], reverse=True))

In [63]:
gain

In [64]:
loss_wc

{'likely': 4,
 'made': 4,
 'mind': 4,
 'size': 3,
 'morning': 3,
 'rate': 3,
 'deleted': 3,
 'order': 3,
 'funds': 3,
 'new': 3,
 'through': 3,
 'season': 2,
 'type': 2,
 'acquisition': 2,
 'stay': 2,
 'hike': 2,
 'free': 2,
 'plan': 2,
 'httpfinanceyahoocomqstwtrql1': 2,
 'app': 2,
 'first': 2,
 'losing': 2,
 'trouble': 2,
 'invested': 2,
 'sign': 2,
 'decision': 2,
 'panic': 2,
 'isnt': 2,
 'makes': 2,
 'dip': 2,
 'beat': 2,
 'rumors': 2,
 'resistance': 2,
 'momentum': 2,
 'shot': 2,
 'base': 2,
 'crm': 2,
 'zlcs': 1,
 'zgnx': 1,
 'watch': 1,
 'carrefully': 1,
 'mix': 1,
 'retailer': 1,
 'safetydecent': 1,
 'gain': 1,
 'spending': 1,
 'suggestions': 1,
 'playing': 1,
 'anytime': 1,
 'soonmy': 1,
 'worthwhile': 1,
 'pick': 1,
 'dealunlike': 1,
 'twtrs': 1,
 'world': 1,
 'bleeding': 1,
 'mopub': 1,
 'adsense': 1,
 'twitterthey': 1,
 'monetizing': 1,
 'fleeting': 1,
 'knows': 1,
 'pivot': 1,
 'relevant': 1,
 'tts': 1,
 'estimate': 1,
 'normally': 1,
 'fast': 1,
 'traders': 1,
 'sensitiv

In [65]:
gain_wc

{'much': 9,
 'wrong': 8,
 'blue': 6,
 'companies': 6,
 'few': 6,
 'any': 5,
 'selling': 5,
 'looks': 5,
 'last': 5,
 'used': 4,
 'less': 4,
 'actually': 4,
 'hasnt': 3,
 'find': 3,
 'cheap': 3,
 'shorted': 3,
 'risk': 3,
 'great': 3,
 'far': 3,
 'opportunities': 3,
 'either': 3,
 'chip': 3,
 'mostly': 3,
 'started': 3,
 'lower': 3,
 'chips': 3,
 'per': 3,
 'oil': 3,
 'portfolio': 3,
 'until': 3,
 'report': 3,
 'our': 3,
 'upon': 3,
 'profitable': 3,
 'called': 2,
 'check': 2,
 'works': 2,
 'let': 2,
 'borrowed': 2,
 'prices': 2,
 'havent': 2,
 'there': 2,
 'enjoy': 2,
 'willing': 2,
 'fire': 2,
 'plenty': 2,
 'charts': 2,
 'product': 2,
 'niche': 2,
 'perspective': 2,
 'business': 2,
 'strategies': 2,
 'low': 2,
 'use': 2,
 'current': 2,
 'generate': 2,
 'end': 2,
 'favor': 2,
 'safe': 2,
 'reading': 2,
 'happy': 2,
 'seeing': 2,
 'upside': 2,
 'talking': 2,
 'lows': 2,
 'absolutely': 2,
 'managed': 2,
 'year': 2,
 'shit': 2,
 'reasons': 2,
 'removed': 2,
 'dividends': 2,
 'released': 