In [1]:
# Installing dependencies
#!pip install praw
#!pip install psaw
#!pip install yfinance

In [2]:
import praw
from psaw import PushshiftAPI
import json
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta

# Suppress warning messages
import warnings
#warnings.filterwarnings('ignore')

In [3]:
# Read the Reddit API credentials (client_id, client_secret, user_agent)
# from a local JSON file so they are never hardcoded in the notebook.
with open('info.json') as f:
    info = dict(json.load(f))

In [4]:
# Build the PRAW client from the loaded credentials, then wrap it in a
# PushshiftAPI instance that is used for all searches below.
reddit = praw.Reddit(
    client_id=info["client_id"],
    user_agent=info["user_agent"],
    client_secret=info["client_secret"],
)
api = PushshiftAPI(reddit)

In [5]:
# Search r/stocks for submissions mentioning TWTR and flatten every returned
# object into a plain dict of its instance attributes, one DataFrame row each.
subm_dicts = [
    {attr: getattr(submission, attr) for attr in vars(submission)}
    for submission in api.search_submissions(
        subreddit='stocks',
        q="TWTR",
        filter=['url','author', 'title', 'subreddit'],
        limit=100,
    )
]
df = pd.DataFrame(subm_dicts)
df



Unnamed: 0,comment_limit,comment_sort,_reddit,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,...,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,_fetched,_comments_by_id,link_flair_template_id
0,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,#Good morning traders and investors of the r/s...,t2_eaak0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/txlkg...,3856752,1.649250e+09,0,,False,False,{},
1,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_4yrm7th0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/txef6...,3856752,1.649221e+09,0,,False,False,{},866eb162-65e6-11e5-a903-1252b640afe9
2,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_3u5upg0j,False,,0,...,False,https://www.reddit.com/r/stocks/comments/twykl...,3856752,1.649175e+09,0,,False,False,{},5a4c814a-65e6-11e5-b65e-122ab0778f8b
3,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,#Good morning traders and investors of the r/s...,t2_eaak0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/twts1...,3856752,1.649162e+09,0,,False,False,{},
4,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_153n7s,False,,0,...,False,https://www.reddit.com/r/stocks/comments/tw47e...,3856752,1.649085e+09,0,,False,False,{},5a4c814a-65e6-11e5-b65e-122ab0778f8b
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_2me1cihr,False,,0,...,False,https://www.reddit.com/r/stocks/comments/n1w56...,3856752,1.619796e+09,0,,False,False,{},
96,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,[removed],t2_46aro24r,False,,0,...,False,https://www.reddit.com/r/stocks/comments/n0tdi...,3856752,1.619658e+09,0,,False,False,{},
97,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,"**PsychoMarket Recap - Monday, April 26, 2021*...",t2_7gtjd4c0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/mz8dh...,3856752,1.619472e+09,1,,False,False,{},
98,2048,confidence,<praw.reddit.Reddit object at 0x00000231DB56E070>,,stocks,"**PsychoMarket Recap - Thursday, April 22, 202...",t2_7gtjd4c0,False,,0,...,False,https://www.reddit.com/r/stocks/comments/mwegi...,3856752,1.619125e+09,1,,False,False,{},


In [6]:
# Download price history for TWTR. No start/end/period is passed, so
# yfinance's default date range applies.
twtr = yf.download(tickers='TWTR', progress=True)
twtr

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-11-07,45.099998,50.090000,44.000000,44.900002,44.900002,117701600
2013-11-08,45.930000,46.939999,40.689999,41.650002,41.650002,27925300
2013-11-11,40.500000,43.000000,39.400002,42.900002,42.900002,16113900
2013-11-12,43.660000,43.779999,41.830002,41.900002,41.900002,6316700
2013-11-13,41.029999,42.869999,40.759998,42.599998,42.599998,8688300
...,...,...,...,...,...,...
2022-03-31,39.110001,39.230000,38.410000,38.689999,38.689999,13208300
2022-04-01,39.160000,39.849998,39.000000,39.310001,39.310001,12122600
2022-04-04,47.869999,51.369999,46.860001,49.970001,49.970001,268465400
2022-04-05,53.849998,54.570000,50.560001,50.980000,50.980000,217520100


## Processing Tickers

In [7]:
# At close, calculate the real and percent change since last close
def get_diff(ticker_data):
    """Compute the change in closing price from each row to the next.

    Parameters
    ----------
    ticker_data : pandas.DataFrame
        Price history containing a "Close" column. Rows are assumed to be in
        chronological order; they are compared positionally, whatever the
        frame's index is.

    Returns
    -------
    tuple[list, list]
        ``(real, percent)``, each with one entry per row of ``ticker_data``.
        ``real[i]`` is ``Close[i] - Close[i-1]`` and ``percent[i]`` is that
        difference divided by ``Close[i-1]`` (a fraction, so 0.05 means +5%).
        The first row has no predecessor, so both lists start with 0.
        Two empty lists are returned for an empty frame.
    """
    # Work on a positionally indexed copy of the column so the arithmetic
    # below cannot be misaligned by the caller's index.
    close = ticker_data["Close"].reset_index(drop=True)
    if close.empty:
        return [], []

    previous_close = close.shift(1)
    change = close - previous_close

    real = change.tolist()
    percent = (change / previous_close).tolist()

    # shift() leaves NaN in the first slot; the contract is 0 for the first row.
    real[0] = 0
    percent[0] = 0
    return real, percent

In [99]:
# Get the reddit comments that mention a certain ticker n days before a large change in stock price
def get_pre_change_posts(ticker, ticker_gain, days=1, limit=1000, subreddit="stocks,stockmarket,stocksandtrading,daytrading,investing,stocks_picks,stockstobuytoday"):
    """Collect Reddit comments mentioning ``ticker`` ahead of each listed date.

    For every row of ``ticker_gain`` a search window of ``days`` days is built
    that ends 6 hours after the row's "Date", and the module-level ``api`` is
    queried for comments matching ``ticker`` inside that window.

    Parameters
    ----------
    ticker : str
        Search term passed to the API as ``q``.
    ticker_gain : pandas.DataFrame
        Rows of interest; each must have a "Date" value exposing
        ``.timestamp()`` (e.g. a pandas Timestamp).
    days : int, default 1
        Length of the look-back window in days.
    limit : int, default 1000
        Maximum number of results requested per window.
    subreddit : str
        Comma-separated subreddit names to search.

    Returns
    -------
    pandas.DataFrame
        One row per returned comment, with one column per instance attribute
        of the returned object. An empty DataFrame is returned when
        ``ticker_gain`` has no rows or nothing was found.
    """
    records = []
    for _, row in ticker_gain.iterrows():
        # NOTE(review): timestamp() -> fromtimestamp() round-trips through the
        # machine's local timezone; confirm the 6-hour offset is intended to
        # compensate for that.
        end_date = datetime.fromtimestamp(row["Date"].timestamp()) + timedelta(hours=6)
        start_date = end_date - timedelta(days=days)

        # Comments (not submissions) are searched here.
        comments = api.search_comments(after=start_date, before=end_date, q=ticker, subreddit=subreddit, filter=['url','author', 'title', 'subreddit'], limit=limit)
        records.extend({k: getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in comments)

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # growing a frame inside a loop is quadratic.
    return pd.DataFrame(records)

In [100]:
def word_counts(df, column="body", min_letters=3):
    """Count case-insensitive word occurrences in a text column.

    Each entry of ``df[column]`` is split on whitespace, every token is
    stripped of non-alphanumeric characters, and the remaining words are
    tallied under their lower-cased form.

    Two kinds of token are ignored:

    * tokens shorter than ``min_letters`` characters after stripping;
    * tokens of 2 to 5 characters that are unchanged by ``upper()`` (all-caps
      words and pure digit strings), since they are likely ticker symbols.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to analyse.
    column : str, default "body"
        Name of the text column.
    min_letters : int, default 3
        Minimum length of a word for it to be counted.

    Returns
    -------
    dict[str, int]
        Lower-cased word -> number of occurrences, in first-seen order.
    """
    counts = {}
    for text in df[column]:
        # Skip missing values (NaN/None) instead of failing on .split().
        if not isinstance(text, str):
            continue
        # split() with no argument also breaks on newlines and tabs, so words
        # on adjacent lines are not fused together.
        for token in text.split():
            word = ''.join(ch for ch in token if ch.isalnum())
            if len(word) < min_letters:
                continue
            # Exclude words that are likely tickers
            if word == word.upper() and 1 < len(word) <= 5:
                continue
            # Look up and store under the same lower-cased key so that
            # "Hello" and "hello" accumulate into one counter.
            key = word.lower()
            counts[key] = counts.get(key, 0) + 1
    return counts

In [131]:
def remove_shared_keys(dict_a, dict_b, cutoff=2):
    """Drop keys shared by two count dictionaries unless one side dominates.

    For every key present in both inputs:

    * if ``dict_b``'s count is more than ``cutoff`` times ``dict_a``'s, the
      key is kept only in the second result;
    * if ``dict_a``'s count is more than ``cutoff`` times ``dict_b``'s, the
      key is kept only in the first result;
    * otherwise neither side dominates and the key is removed from both.

    Keys present in only one input are kept unchanged. The inputs are not
    modified.

    Parameters
    ----------
    dict_a, dict_b : dict
        Mappings of key -> numeric count.
    cutoff : number, default 2
        Factor by which one count must exceed the other to count as dominant.

    Returns
    -------
    tuple[dict, dict]
        Filtered copies of ``dict_a`` and ``dict_b``, in that order.
    """
    a = dict_a.copy()
    b = dict_b.copy()

    # Iterate over the untouched input so popping from the copies is safe.
    for key in dict_a:
        if key not in dict_b:
            continue
        count_a = dict_a[key]
        count_b = dict_b[key]
        if count_b > cutoff * count_a:
            a.pop(key)
        elif count_a > cutoff * count_b:
            b.pop(key)
        else:
            a.pop(key)
            b.pop(key)
    return a, b

In [102]:
def remove_infrequent_words(dict_a, min_count=2):
    """Return a copy of ``dict_a`` without its low-count entries.

    An entry is dropped when its value is strictly less than ``min_count``;
    all other entries are kept in their original order. ``dict_a`` itself is
    left unmodified.
    """
    return {
        word: count
        for word, count in dict_a.items()
        if not count < min_count
    }

In [137]:
# Generate information for a given ticker
def process_ticker(ticker, gain_cutoff=0.05, loss_cutoff=0.05, limit=100, days=1):
    """Run the full word-frequency pipeline for one ticker.

    Steps, in order:

    1. download the ticker's price history and compute per-row changes with
       ``get_diff``;
    2. select rows whose fractional change is above ``gain_cutoff`` or below
       ``-loss_cutoff``;
    3. fetch the Reddit text preceding those rows via
       ``get_pre_change_posts`` (``days`` look-back, ``limit`` results);
    4. count words, sort by descending count, drop infrequent words, and
       filter the two dictionaries against each other with
       ``remove_shared_keys``.

    Any exception raised along the way is printed and ``({}, {})`` is
    returned, so one failing ticker does not abort a batch run.

    Returns
    -------
    tuple[dict, dict]
        The pair returned by ``remove_shared_keys`` for the gain-side and
        loss-side word counts, or two empty dicts on failure.
    """
    def _by_count_desc(counts):
        # Re-build the dict ordered from most to least frequent word.
        return dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))

    try:
        history = yf.download(ticker, progress=False)
        history.reset_index(inplace=True)

        real_change, pct_change = get_diff(history)
        history["Real_Change"] = real_change
        history["Percent_Change"] = pct_change

        big_gains = history[history["Percent_Change"] > gain_cutoff]
        big_losses = history[history["Percent_Change"] < -loss_cutoff]

        # Both fetches happen before any counting, gains first.
        gain_posts = get_pre_change_posts(ticker, big_gains, days, limit)
        loss_posts = get_pre_change_posts(ticker, big_losses, days, limit)

        gain_counts = _by_count_desc(word_counts(gain_posts))
        loss_counts = _by_count_desc(word_counts(loss_posts))

        gain_only, loss_only = remove_shared_keys(
            remove_infrequent_words(gain_counts),
            remove_infrequent_words(loss_counts),
        )
    except Exception as e:
        print(e)
        return {}, {}

    return gain_only, loss_only

In [138]:
#gain_only, loss_only = process_ticker("FB")

In [139]:
#gain_only

In [140]:
#loss_only

In [142]:
# Run the pipeline for every ticker, keeping the per-ticker dictionaries in
# parallel lists; the ticker is printed once its processing has finished.
all_gain, all_loss = [], []
for ticker in ("TWTR", "FB", "MSFT", "ADBE", "AAPL", "SNAP", "AMZN", "NCL", "DIS", "NFLX"):
    gain_only, loss_only = process_ticker(ticker)
    all_gain.append(gain_only)
    all_loss.append(loss_only)
    print(ticker)



KeyboardInterrupt: 

In [None]:
def combine_dict_list(list_of_dicts):
    """Merge several count dictionaries into one, adding up shared keys.

    Returns a new dict whose entries are ordered by descending total; keys
    with equal totals keep the order in which they were first seen.
    """
    totals = {}
    for counts in list_of_dicts:
        for key, value in counts.items():
            if key in totals:
                totals[key] = totals[key] + value
            else:
                totals[key] = value

    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)

In [None]:
# Pool the per-ticker counts on each side, then filter the two pooled
# dictionaries against each other.
combined_gain = combine_dict_list(all_gain)
combined_loss = combine_dict_list(all_loss)
gain, loss = remove_shared_keys(combined_gain, combined_loss)

In [None]:
# Display the gain-side dictionary returned by remove_shared_keys
gain

In [None]:
# Display the loss-side dictionary returned by remove_shared_keys
loss