## Imports and Setup

In [1]:
# Loads in the environment variables for secret key and public key.
# requires pip install python-dotenv
# requires a .env file to be used in the same directory as this notebook
%load_ext dotenv
%dotenv

In [2]:
from stock_scraper import scrape_stocks_from_dict
from reddit_scraper import (get_headers, get_posts,
                            search_flair, search_daily,
                            search_comments, get_daily_discussion_comments)
import os
import requests
import pandas as pd
START_DATE = '2021-01-01'
END_DATE = '2021-12-31'

## Authorization
this is a necessary setup step. The whole thing fails if this is not set up.

In [3]:
headers = get_headers()

In [4]:
# verification that everything works and quick test
requests.get('https://oauth.reddit.com/api/v1/me',
             headers=headers).json()

{'is_employee': False,
 'seen_layout_switch': False,
 'has_visited_new_profile': False,
 'pref_no_profanity': True,
 'has_external_account': False,
 'pref_geopopular': '',
 'seen_redesign_modal': True,
 'pref_show_trending': True,
 'subreddit': {'default_set': True,
  'user_is_contributor': False,
  'banner_img': '',
  'restrict_posting': True,
  'user_is_banned': False,
  'free_form_reports': True,
  'community_icon': None,
  'show_media': True,
  'icon_color': '#94B3FF',
  'user_is_muted': None,
  'display_name': 'u_lucky_nooodle',
  'header_img': None,
  'title': '',
  'coins': 0,
  'previous_names': [],
  'over_18': False,
  'icon_size': [256, 256],
  'primary_color': '',
  'icon_img': 'https://www.redditstatic.com/avatars/defaults/v2/avatar_default_6.png',
  'description': '',
  'allowed_media_in_comments': [],
  'submit_link_label': '',
  'header_size': None,
  'restrict_commenting': False,
  'subscribers': 0,
  'submit_text_label': '',
  'is_default_icon': True,
  'link_flair_po

## Main methods

In [5]:
# basic way to get posts
hot_wsb_posts = get_posts(headers)
hot_wsb_posts = hot_wsb_posts[["kind", "title","selftext",
               "id", "upvote_ratio",
               "score", "ups"]]
hot_wsb_posts.head(5)

Unnamed: 0,kind,title,selftext,id,upvote_ratio,score,ups
0,t3,"What Are Your Moves Tomorrow, May 21, 2024",[View Post](https://new.reddit.com/r/wallstree...,1cwous5,0.85,38,38
1,t3,Most Anticipated Earnings Releases for the wee...,,1cto0z5,0.93,287,287
2,t3,Don't mess with BA!,,1cwa0oj,0.96,4674,4674
3,t3,Hims Debuts $199 Weight-Loss Shots at 85% Disc...,,1cwfvh4,0.94,1226,1226
4,t3,"After ten years, I‚Äôm only up $2.,000","You can clearly see when I discovered options,...",1cwn1wx,0.96,485,485


### Search by Flairs

In [6]:
headers

{'User-Agent': 'MyAPI/0.0.1',
 'Authorization': 'bearer eyJhbGciOiJSUzI1NiIsImtpZCI6IlNIQTI1NjpzS3dsMnlsV0VtMjVmcXhwTU40cWY4MXE2OWFFdWFyMnpLMUdhVGxjdWNZIiwidHlwIjoiSldUIn0.eyJzdWIiOiJ1c2VyIiwiZXhwIjoxNzE2MzI3NjIyLjEyNDQ1OCwiaWF0IjoxNzE2MjQxMjIyLjEyNDQ1OCwianRpIjoiMndOZWJ3ajQ2cU9aSjh3UmRkbGpBeUVaQkNTcFN3IiwiY2lkIjoidm1ZSlFVdTdWN0xzZG8yaldDeWtJUSIsImxpZCI6InQyX2JmOXJ0am5pIiwiYWlkIjoidDJfYmY5cnRqbmkiLCJsY2EiOjE2MTc5MzI4ODUwMDAsInNjcCI6ImVKeUtWdEpTaWdVRUFBRF9fd056QVNjIiwiZmxvIjo5fQ.XoA9vk4sGqfW4KEvQ2HPGnsqLSbydR7Fp85gSCy1pQC6-edChBp4MT-TVL3xc8pE6JWusmnLFaS2QSdQQKJcey0QcfMf3kGYKWwafbhpIbM8xS0kZHuYhGTdtiEWX5_4NiE9spn5Lk-42Zm_mVVYwbXwBneIJL7SS2kMqYL7tcQox_93dRnod7eKt_qtESnXiBfnLJ6gCFuPf4TBYmUlKOxHwwleykqI0pVUV-68Cw0aJ-QNsSaNOJmDmnjLLIIACHVAFwg6WKl8pLskl9z6Uf8_AoO3lTmSYgDNTn5_V7nExvcgWa6DCgVFQCn7w3zCfHciOO3X4tRArl3h9O9TFQ'}

In [7]:
flairs = requests.get('https://oauth.reddit.com/r/wallstreetbets/api/link_flair',
             headers=headers).json()
for f in flairs:
    print(f["text"])

YOLO
DD
Discussion
Gain
Loss
Meme
News
Chart


In [8]:
# simple search for the dd flair posts
search_flair(headers, flair="DD")

Unnamed: 0,kind,approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,...,media_metadata,media_metadata.1,media_metadata.2,media_metadata.3,media_metadata.4,media_metadata.5,media_metadata.6,media_metadata.7,media_metadata.8,media_metadata.9
0,t3,,wallstreetbets,So a lot of people liked the HIMS jump this we...,t2_cltve6wr,False,,0,False,My HIMS DickDilligence,...,,,,,,,,,,
1,t3,,wallstreetbets,FMC Corporation (FMC) produces and sells all 3...,t2_2yajsxi5,False,,0,False,FMC may soar on a spike in pesticide demand,...,,,,,,,,,,
2,t3,,wallstreetbets,It seems all the Hype on Rocket Lab has died a...,t2_9el4q7au,False,,0,False,$RKLB Everybody Left as the Party Gets Started?,...,,,,,,,,,,
3,t3,,wallstreetbets,Tandem Diabetes Care ($TNDM) is likely going t...,t2_225bfdy8,False,,0,False,$TNDM may be getting acquired soon,...,,,,,,,,,,
4,t3,,wallstreetbets,"There's a lot of chatter over OTCs uplisting, ...",t2_7fhv8jas,False,,0,False,$CRON : most likely cannabis company to be acq...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,t3,,wallstreetbets,I think stock is should atleast trade at 2-3x...,t2_qmdmw23h,False,,0,False,GoPro is ridiculously cheap fundamentally.,...,,,,,,,,,,
96,t3,,wallstreetbets,"Listen up, degenerates and fellow regards of W...",t2_hwdqb,False,,0,False,üöÄüåï SOFI: The Sleeper About to Blast Off! üåïüöÄ,...,,,,,,,,,,
97,t3,,wallstreetbets,"**Position:** \n\n2,000 shares of SPCE at $1....",t2_85c1bfc9,False,,0,False,"SPCE - DD and ""infinite"" option hack",...,,,,,,,,,,
98,t3,,wallstreetbets,The stock is ripe for pull back. Somehow META ...,t2_qben5ncs,False,,0,False,My bearish thesis on $META,...,,,,,,,,,,


### Search Daily Discussion Thread

In [9]:
comments = get_daily_discussion_comments(headers)

KeyboardInterrupt: 

In [None]:
comments[-5]

Unnamed: 0,kind,subreddit_id,approved_at_utc,author_is_blocked,comment_type,awarders,mod_reason_by,banned_by,author_flair_type,total_awards_received,...,media_metadata,media_metadata.1,media_metadata.2,media_metadata.3,media_metadata.4,media_metadata.5,media_metadata.6,media_metadata.7,count,children
0,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
1,t1,t5_2th52,,False,,[],,,richtext,0.0,...,,,,,,,,,,
2,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
3,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
4,t1,t5_2th52,,False,,[],,,richtext,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
107,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
108,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
109,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,


In [None]:
len(comments)

100

In [None]:
pd.concat(comments, ignore_index=True)

InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
comments[20]

Unnamed: 0,kind,subreddit_id,approved_at_utc,author_is_blocked,comment_type,awarders,mod_reason_by,banned_by,author_flair_type,total_awards_received,...,media_metadata,media_metadata.1,media_metadata.2,media_metadata.3,media_metadata.4,media_metadata.5,media_metadata.6,media_metadata.7,count,children
0,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
1,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
2,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
3,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
4,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,t1,t5_2th52,,False,,[],,,richtext,0.0,...,,,,,,,,,,
71,t1,t5_2th52,,False,,[],,,richtext,0.0,...,,,,,,,,,,
72,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,
73,t1,t5_2th52,,False,,[],,,text,0.0,...,,,,,,,,,,


#### Daily Discussion Comments

## Do Sentiment Analysis and get Scores

### Rules Based

In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer


In [23]:
# Sample data (replace with your actual data)
data = {'text': ['This movie was great!', 'I am so disappointed.', 'A mediocre film overall.']}
analyzer = SentimentIntensityAnalyzer()
# Define a function to extract the compound score
def vader_sentiment(text):
  score = analyzer.polarity_scores(text)
  return score['compound']

In [24]:
df = pd.DataFrame(data)
df["text"].apply(vader_sentiment)

0    0.6588
1   -0.5256
2    0.0000
Name: text, dtype: float64

In [31]:
hot_wsb_posts["text"] = hot_wsb_posts['title'] + " \n\n " + hot_wsb_posts['selftext']

In [32]:
hot_wsb_posts["text"]

0      What Are Your Moves Tomorrow, May 21, 2024 \n\...
1      Most Anticipated Earnings Releases for the wee...
2                              Don't mess with BA! \n\n 
3      Hims Debuts $199 Weight-Loss Shots at 85% Disc...
4      After ten years, I‚Äôm only up $2.,000 \n\n You ...
                             ...                        
97     September 20, 2024 - If I hit I‚Äôll eat a red c...
98     What if ‚Äúvolatile food and gas prices‚Äù remain ...
99     Kohl's (KSS) Primed for a Nice Move? \n\n High...
100    SBUX will be over $100 by end of 2024 \n\n I g...
101    Hey guys, I just figured out how to beat theta...
Name: text, Length: 102, dtype: object

In [33]:
# Apply the function to the 'text' column and create a new column 'sentiment'
hot_wsb_posts['vader_sentiment'] = hot_wsb_posts['text'].apply(vader_sentiment)

### Embedding Models

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer

# Load data (replace 'your_column_name' with your actual column name)
data = hot_wsb_posts
text_column = data["text"]

# Define model
model = SentenceTransformer("all-mpnet-base-v2")

# Function to generate embeddings
def generate_embedding(text):
  embedding = model.encode(text)
  return embedding.tolist()  # Convert to list for storage

# Create new column for embeddings
data["embeddings"] = text_column.apply(generate_embedding)

# Now 'data' has a new column 'embeddings' with vector representations of the text


In [None]:
# setting some ground truths. We can do better by choosing a collection of 
# good posts and averaging the embeddings, but this is funnier and we live for the meme
buy_embedding = "To the Moon"
sell_embedding = "Sell that stock"

## RAG DB creation

In [22]:
documents = []
for index, row in hot_wsb_posts.iterrows():
    documents.append({"id": row["id"],
                      "text": row["title"] + "\n\n" + row["selftext"]})

Unnamed: 0,kind,title,selftext,id,upvote_ratio,score,ups,sentiment,vader_sentiment
0,t3,"What Are Your Moves Tomorrow, May 21, 2024",[View Post](https://new.reddit.com/r/wallstree...,1cwous5,0.85,38,38,0.0000,0.0000
1,t3,Most Anticipated Earnings Releases for the wee...,,1cto0z5,0.93,287,287,0.0000,0.0000
2,t3,Don't mess with BA!,,1cwa0oj,0.96,4674,4674,0.0000,0.0000
3,t3,Hims Debuts $199 Weight-Loss Shots at 85% Disc...,,1cwfvh4,0.94,1226,1226,0.0000,0.0000
4,t3,"After ten years, I‚Äôm only up $2.,000","You can clearly see when I discovered options,...",1cwn1wx,0.96,485,485,0.4019,0.4019
...,...,...,...,...,...,...,...,...,...
97,t3,"September 20, 2024 - If I hit I‚Äôll eat a red c...",,1cvgbzj,0.80,137,137,0.0000,0.0000
98,t3,What if ‚Äúvolatile food and gas prices‚Äù remain ...,"Even if Fed policy works, and prices for goods...",1cvk8gy,0.75,49,49,-0.8858,-0.8858
99,t3,Kohl's (KSS) Primed for a Nice Move?,"High short interest, undervaluation, and posit...",1cw2vdo,0.54,3,3,0.9719,0.9719
100,t3,SBUX will be over $100 by end of 2024,I guarantee you because daddy laxman by next e...,1cvb2jf,0.77,278,278,-0.7430,-0.7430


## Stock Scraping

In [3]:
stocks_dict = {
    "GME": "GameStop",
    "AAPL": "Apple",
    "AMC": "AMC",
    "BB": "BlackBerry",
    "GME": "GameStop",
    "NOK": "Nokia",
    "NVDA": "Nvidia",
    "PLTR": "Palantir",
    "TSLA": "Tesla",
    "SPY": "SPY",
    "^GSPC": "S&P 500"
}
scrape_stocks_from_dict(stocks_dict, START_DATE, END_DATE)

GameStop (GME)


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Apple (AAPL)
AMC (AMC)


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


BlackBerry (BB)
Nokia (NOK)


[*********************100%%**********************]  1 of 1 completed


Nvidia (NVDA)


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


Palantir (PLTR)
Tesla (TSLA)
SPY (SPY)


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed

S&P 500 (^GSPC)





Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,name,date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-01-04,4.750000,4.775000,4.287500,4.312500,4.312500,40090000,GameStop,2021-01-04
2021-01-05,4.337500,4.520000,4.307500,4.342500,4.342500,19846000,GameStop,2021-01-05
2021-01-06,4.335000,4.745000,4.332500,4.590000,4.590000,24224800,GameStop,2021-01-06
2021-01-07,4.617500,4.862500,4.505000,4.520000,4.520000,24517200,GameStop,2021-01-07
2021-01-08,4.545000,4.575000,4.270000,4.422500,4.422500,25928000,GameStop,2021-01-08
...,...,...,...,...,...,...,...,...
2021-12-23,4703.959961,4740.740234,4703.959961,4725.790039,4725.790039,2913040000,S&P 500,2021-12-23
2021-12-27,4733.990234,4791.490234,4733.990234,4791.189941,4791.189941,2770290000,S&P 500,2021-12-27
2021-12-28,4795.490234,4807.020020,4780.040039,4786.350098,4786.350098,2707920000,S&P 500,2021-12-28
2021-12-29,4788.640137,4804.060059,4778.080078,4793.060059,4793.060059,2963310000,S&P 500,2021-12-29
