In [1]:
# Installing dependencies
#!pip install praw
#!pip install psaw
#!pip install yfinance

In [2]:
import praw
from psaw import PushshiftAPI
import json
import pandas as pd
import yfinance as yf
from datetime import datetime, timedelta
import traceback

# Suppress warning messages
import warnings
#warnings.filterwarnings('ignore')

In [3]:
# Load client_id, secret_id, and user_agent
with open('info.json') as f:
     info = json.load(f)
        
info = dict(info)

In [4]:
# Initialize Reddit and PushshiftAPI instances
reddit = praw.Reddit(client_id=info["client_id"], user_agent=info["user_agent"], client_secret=info["client_secret"])
api = PushshiftAPI(reddit)

In [121]:
# Store results of a search in a DataFrame
"""
subm_dicts = [{k:getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in api.search_submissions(subreddit='stocks', q="TWTR", filter=['url','author', 'title', 'subreddit'], limit=100)]
df = pd.DataFrame(subm_dicts)
df
"""
_ = None

In [6]:
# Get historical stock data for a ticker
twtr = yf.download('TWTR', progress=True)
twtr

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013-11-07,45.099998,50.090000,44.000000,44.900002,44.900002,117701600
2013-11-08,45.930000,46.939999,40.689999,41.650002,41.650002,27925300
2013-11-11,40.500000,43.000000,39.400002,42.900002,42.900002,16113900
2013-11-12,43.660000,43.779999,41.830002,41.900002,41.900002,6316700
2013-11-13,41.029999,42.869999,40.759998,42.599998,42.599998,8688300
...,...,...,...,...,...,...
2022-04-04,47.869999,51.369999,46.860001,49.970001,49.970001,268465400
2022-04-05,53.849998,54.570000,50.560001,50.980000,50.980000,217520100
2022-04-06,50.040001,52.869999,49.299999,50.770000,50.770000,159034700
2022-04-07,50.470001,51.639999,46.549999,48.029999,48.029999,120715600


## Processing Tickers

This initial thought process is not great. I decided that looking for specific words wwould not be a good idea, as it takes much of the context out of the comment. (Something that briefly mentions TSLA, but is actually talking about how great MSFT is would be useless in predicting TSLA stock)

In [9]:
# At close, calculate the real and percent change since last close
def get_diff(ticker_data):
    df = ticker_data.copy()
    real = []
    percent = []
    for index, row in df.reset_index().iterrows():
        if(index == 0):
            real.append(0)
            percent.append(0)
        else:
            real.append(row["Close"]-df.iloc[index-1]["Close"])
            percent.append(real[-1]/df.iloc[index-1]["Close"])
    return real, percent

# Get the reddit posts that mention a certain ticker n days before a large change in stock price
def get_pre_change_posts(ticker, ticker_gain, days=1, limit=1000, subreddit="stocks,stockmarket,stocksandtrading,daytrading,investing,stocks_picks,stockstobuytoday"):
    df = None
    for index, row in ticker_gain.iterrows():
        start_date = datetime.fromtimestamp(row["Date"].timestamp()) + timedelta(hours=6, days=-days)
        end_date = datetime.fromtimestamp(row["Date"].timestamp()) + timedelta(hours=6)
        
        # TODO: Check whether comments would be better than submissions
        
        submissions = api.search_comments(after=start_date, before=end_date, q=ticker, subreddit=subreddit, filter=['url','author', 'title', 'subreddit'], limit=limit)
        if(df is None):
            df = pd.DataFrame([{k:getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in submissions])
        else:
            df = df.append([{k:getattr(praw_obj, k) for k in vars(praw_obj)} for praw_obj in submissions], ignore_index=True)
    return df

def word_counts(df, column="body", min_letters=3):
    counts = {}
    for i in list(df[column]):
        for j in i.split(" "):
            j = ''.join(k for k in j if k.isalnum())
            # Exclude words that are likely tickers
            if(j == j.upper() and len(j) > 1 and len(j) <= 5):
                pass
            elif(len(j) < 3):
                pass
            elif(j not in counts.keys()):
                counts[j.lower()] = 1
            else:
                counts.update({j.lower():counts.get(j.lower())+1})
    return counts

def remove_shared_keys(dict_a, dict_b, cutoff=2):
    a = dict_a.copy()
    b = dict_b.copy()
    
    rm_a = []
    rm_b = []
    for i in a:
        if(i in b):
            if(b.get(i) > 2*a.get(i)):
                rm_a.append(i)
            elif(b.get(i) < 2*a.get(i)):
                rm_b.append(i)
            else:
                rm_a.append(i)
                rm_b.append(i)
    for i in rm_a:
        a.pop(i)
    for i in rm_b:
        b.pop(i)
    return a, b

def remove_infrequent_words(dict_a, min_count=2):
    d = dict_a.copy()
    
    to_remove = []
    for i, x in d.items():
        if(x < min_count):
            to_remove.append(i)
            
    for i in to_remove:
        d.pop(i)
        
    return d

# Generate information for a given ticker
def process_ticker(ticker, gain_cutoff=0.05, loss_cutoff=0.05, limit=100, days=1):
    try:
        ticker_data = yf.download(ticker, progress=False)
        ticker_data.reset_index(inplace=True)
        real, percent = get_diff(ticker_data)

        ticker_data["Real_Change"] = real
        ticker_data["Percent_Change"] = percent

        ticker_gain = ticker_data[ticker_data["Percent_Change"] > gain_cutoff]
        ticker_loss = ticker_data[ticker_data["Percent_Change"] < -loss_cutoff]

        pre_gain = get_pre_change_posts(ticker, ticker_gain, days, limit)
        pre_loss = get_pre_change_posts(ticker, ticker_loss, days, limit)

        gain_wc = dict(sorted(word_counts(pre_gain).items(), key=lambda x: x[1], reverse=True))
        loss_wc = dict(sorted(word_counts(pre_loss).items(), key=lambda x: x[1], reverse=True))

        gain_freq = remove_infrequent_words(gain_wc)
        loss_freq = remove_infrequent_words(loss_wc)

        gain_only, loss_only = remove_shared_keys(gain_freq, loss_freq)
    except Exception as e:
        print(e)
        return {}, {}
    
    return gain_only, loss_only

#gain_only, loss_only = process_ticker("FB")

#gain_only

#loss_only
"""
all_gain = []
all_loss = []
for ticker in ["TWTR", "FB", "MSFT", "ADBE", "AAPL", "SNAP", "AMZN", "NCL", "DIS", "NFLX"]:
    gain_only, loss_only = process_ticker(ticker)
    all_gain.append(gain_only)
    all_loss.append(loss_only)
    print(ticker)

def combine_dict_list(list_of_dicts):
    single_dict = {}
    for d in list_of_dicts:
        for i in d:
            if(i not in single_dict):
                single_dict[i] = d.get(i)
            else:
                single_dict.update({i:single_dict.get(i)+d.get(i)})
    return dict(sorted(single_dict.items(), key=lambda x: x[1], reverse=True))

gain, loss = remove_shared_keys(combine_dict_list(all_gain), combine_dict_list(all_loss))

gain

loss"""
_ = None # This is just to stop automatic output of block commented code

## Better Method (Probably)

Instead of looking at posts/comments the day before and predicting whether the next day will close higher, this will be looking at the posts/comments from the previous day's close to the current day's open and predicting whether the close price will be higher than the open price. 

In [11]:
def daily_change(data):
    change = []
    up = []
    for index, row in data.iterrows():
        change.append(row["Close"]-row["Open"])
        up.append(int(change[-1] > 0))
    return change, up

In [12]:
change, up = daily_change(twtr)

In [15]:
twtr["Daily_Change"] = change
twtr["Positive_Change"] = up

In [16]:
twtr

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Daily_Change,Positive_Change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-11-07,45.099998,50.090000,44.000000,44.900002,44.900002,117701600,-0.199997,0
2013-11-08,45.930000,46.939999,40.689999,41.650002,41.650002,27925300,-4.279999,0
2013-11-11,40.500000,43.000000,39.400002,42.900002,42.900002,16113900,2.400002,1
2013-11-12,43.660000,43.779999,41.830002,41.900002,41.900002,6316700,-1.759998,0
2013-11-13,41.029999,42.869999,40.759998,42.599998,42.599998,8688300,1.570000,1
...,...,...,...,...,...,...,...,...
2022-04-04,47.869999,51.369999,46.860001,49.970001,49.970001,268465400,2.100002,1
2022-04-05,53.849998,54.570000,50.560001,50.980000,50.980000,217520100,-2.869999,0
2022-04-06,50.040001,52.869999,49.299999,50.770000,50.770000,159034700,0.730000,1
2022-04-07,50.470001,51.639999,46.549999,48.029999,48.029999,120715600,-2.440002,0


In [47]:
def get_pre_open_content(data, ticker, start_hour_diff=0, subreddit="stocks,stockmarket,stocksandtrading,daytrading,investing,stocks_picks,stockstobuytoday", limit=100):
    new_col = []
    for index, row in data.iterrows():
        end_time = row.name + timedelta(hours=9, minutes=30)
        start_time = end_time - timedelta(hours=17, minutes=30)
        content = []
        for i in api.search_comments(after=start_time, before=end_time, subreddit=subreddit, q=ticker, filter=['url','author', 'title', 'subreddit'], limit=limit):
            for j in i.body.split("."):
                for k in j.split("\n"):
                    content.append(k)
        new_col.append(content)
    
    return new_col

In [51]:
twtr_head = twtr.head()
twtr["Comments"] = get_pre_open_content(twtr, "TWTR", limit=100)



In [52]:
twtr

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,Daily_Change,Positive_Change,Comments
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2013-11-07,45.099998,50.090000,44.000000,44.900002,44.900002,117701600,-0.199997,0,"[Grab your popcorn and enjoy the TWTR show, ,..."
2013-11-08,45.930000,46.939999,40.689999,41.650002,41.650002,27925300,-4.279999,0,"[How wrong I was, The IPOX index doesn't inc..."
2013-11-11,40.500000,43.000000,39.400002,42.900002,42.900002,16113900,2.400002,1,"[Just FYI, implied vol on TWTR options is goin..."
2013-11-12,43.660000,43.779999,41.830002,41.900002,41.900002,6316700,-1.759998,0,[If you continue to attract new investment cap...
2013-11-13,41.029999,42.869999,40.759998,42.599998,42.599998,8688300,1.570000,1,[Here are the stock that are on my watch list:...
...,...,...,...,...,...,...,...,...,...
2022-04-04,47.869999,51.369999,46.860001,49.970001,49.970001,268465400,2.100002,1,[]
2022-04-05,53.849998,54.570000,50.560001,50.980000,50.980000,217520100,-2.869999,0,"[everything is priced in until it’s not, , ,..."
2022-04-06,50.040001,52.869999,49.299999,50.770000,50.770000,159034700,0.730000,1,"[$TWTR bag holders?, TWTR to $1200, TWTR may b..."
2022-04-07,50.470001,51.639999,46.549999,48.029999,48.029999,120715600,-2.440002,0,[]


### Preparing text for analysis

In [64]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [98]:
def split_sequences(data):
    seqs = []
    vals = []
    for index, row in data.iterrows():
        for comment in row["Comment_Sequences"]:
            if(comment != []):
                seqs.append(comment)
                vals.append(row["Positive_Change"])
    return seqs, vals

In [122]:
def prepare_text(data, train_proportion = 0.8, max_len=20):
    tokenizer = Tokenizer(oov_token = "<OOV>")
    
    # Shuffle the data so that training and testing data are both representative of all timeframes
    shuffled = data.sample(frac=1)
    
    train = shuffled[:int(shuffled.shape[0]*train_proportion)]
    test = shuffled[int(shuffled.shape[0]*train_proportion):]
    
    for comment in train.Comments:
        tokenizer.fit_on_texts(comment)
        
    seqs = []
    for comment in train.Comments:
        seqs.append(tokenizer.texts_to_sequences(comment))
    
    train["Comment_Sequences"] = seqs
    
    X_train, y_train = split_sequences(train)
    
    X_train = pad_sequences(X_train, padding="post", truncating="post", maxlen=max_len)
    
    seqs = []
    for comment in test.Comments:
        seqs.append(tokenizer.texts_to_sequences(comment))
    
    test["Comment_Sequences"] = seqs
    
    X_test, y_test = split_sequences(test)
    
    X_test = pad_sequences(X_test, padding="post", truncating="post", maxlen=max_len)
    
    return X_train, X_test, y_train, y_test

In [123]:
X_train, X_test, y_train, y_test = prepare_text(twtr)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["Comment_Sequences"] = seqs
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["Comment_Sequences"] = seqs


In [125]:
X_train.shape

(7556, 20)

In [127]:
len(y_train)

7556