# Extract Stock Ticker Mentions from Wallstreet Bets Data

In [1]:
#import dependencies
import csv
import pandas as pd
import yfinance as yf
import numpy as np
import datetime as dt

In [2]:
# import Wallstreet Bets csv file from Kaggle
# url= https://www.kaggle.com/gpreda/reddit-wallstreetsbets-posts
csvfile = "./Resources/reddit_wsb.csv"
wsb_csv = pd.read_csv(csvfile)
wsb_csv = pd.read_csv(csvfile, parse_dates=['timestamp'], infer_datetime_format=True)
wsb_csv.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [3]:
wsb_csv.dtypes

title                object
score                 int64
id                   object
url                  object
comms_num             int64
created             float64
body                 object
timestamp    datetime64[ns]
dtype: object

In [4]:
#replace body n/a with empty string
wsb_csv['body'] = wsb_csv['body'].fillna("")

#merge title and body together for comprehensive field search
wsb_csv['title_body'] = wsb_csv['title'] + wsb_csv['body']

#drop unneeded columns
wsb_csv.drop(columns=['score', 'id','url','comms_num','created', 'title', 'body'], inplace=True)

#convert search field to uppercase to allow desensitize case for searching
wsb_csv['title_body'] = wsb_csv['title_body'].str.upper()

#drop time from date timestamp
wsb_csv['timestamp'] = pd.to_datetime(wsb_csv['timestamp']).dt.date

#add weekday to df, need to change timestamp to mirror with stock ticker data
wsb_csv['weekday'] = pd.to_datetime(wsb_csv['timestamp']).dt.day_name()

wsb_csv.head()

Unnamed: 0,timestamp,title_body,weekday
0,2021-01-28,"IT'S NOT ABOUT THE MONEY, IT'S ABOUT SENDING A...",Thursday
1,2021-01-28,MATH PROFESSOR SCOTT STEINER SAYS THE NUMBERS ...,Thursday
2,2021-01-28,EXIT THE SYSTEMTHE CEO OF NASDAQ PUSHED TO HAL...,Thursday
3,2021-01-28,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,Thursday
4,2021-01-28,"NOT TO DISTRACT FROM GME, JUST THOUGHT OUR AMC...",Thursday


In [None]:
#create dictionary with term list for each ticker
ticker_list = {'AMC': ['AMC'], 'GME': ['GME'], 'CLOV': ['CLOV'], 'BB': ['BB'], 'CLF': ['CLF'], 'CLNE': ['CLNE'],
               'WKHS': ['WKHS'], 'SENS': ['SENS'], 'PLTR': ['PLTR'], 'RKT': ['RKT'], 'WWE': ['WWE'], 'FORD': ['FORD']}

#loop through dataframe to identify number of mentions for each ticker
for ticker, stock_names in ticker_list.items():
    for name in stock_names:
        wsb_csv[ticker] = wsb_csv['title_body'].str.contains(name).astype(int)

wsb_csv.head()

In [None]:
#group number of mentions by the date mentioned, move date from index to column
mentions_by_ticker_df = wsb_csv.groupby(["timestamp"]).sum()
mentions_by_ticker_df.reset_index(inplace=True)
mentions_by_ticker_df['timestamp'] = mentions_by_ticker_df['timestamp'].astype('datetime64[ns]')
mentions_by_ticker_df

In [None]:
#chose list of popular stocks from url = https://memestocks.org

#unpivot columns using Pandas melt to consolidate all mentions into one column
columns = ['AMC', 'GME', 'CLOV', 'BB', 'CLF', 'CLNE', 'WKHS', 'SENS', 'PLTR', 'RKT', 'WWE', 'FORD']
consolidated_mentions = pd.melt(mentions_by_ticker_df, id_vars=['timestamp'], value_vars = columns,
                               var_name='ticker', value_name='mention_count')
consolidated_mentions

In [None]:
# delete random September date
indexNames = consolidated_mentions[consolidated_mentions['timestamp']=='2020-09-29'].index
consolidated_mentions.drop(indexNames, inplace=True)
consolidated_mentions.reset_index(drop=True, inplace=True)
consolidated_mentions.head()

In [None]:
# # delete all rows mention count equal to zero
# indexNames = consolidated_mentions[consolidated_mentions['mention_count'] ==0].index
# consolidated_mentions.drop(indexNames, inplace=True)

# #reset the index after dropping zeroes
# consolidated_mentions.reset_index(drop=True)

# consolidated_mentions.head()

In [None]:
#add weekday to df, need to change timestamp to mirror with stock ticker data
consolidated_mentions['weekday'] = pd.to_datetime(consolidated_mentions['timestamp']).dt.day_name()

#create a new column to identify weekends and how many days to add
consolidated_mentions["days_to_add"] = 0

#add 2 days on Saturday to match to Monday's stock data
consolidated_mentions.loc[consolidated_mentions.weekday=="Saturday", 'days_to_add'] = 2

#add 1 day to Sunday to match to Monday's stock data
consolidated_mentions.loc[consolidated_mentions.weekday=="Sunday", 'days_to_add'] = 1

#add days to holidays stock data
consolidated_mentions.loc[consolidated_mentions.timestamp=='2021-02-15', 'days_to_add'] = 1
consolidated_mentions.loc[consolidated_mentions.timestamp=='2021-04-02', 'days_to_add'] = 3
consolidated_mentions.loc[consolidated_mentions.timestamp=='2021-05-31', 'days_to_add'] = 1

#adjust mention date based on days_to_add, this is to line up with the Yahoo Finance data later
temp = consolidated_mentions['days_to_add'].apply(np.ceil).apply(lambda x: pd.Timedelta(x, unit='D'))
consolidated_mentions['Date'] = consolidated_mentions['timestamp'] + temp

#review changes to ensure accuracy
consolidated_mentions.groupby('weekday').max('days_to_add')

In [None]:
total_mentions = consolidated_mentions.groupby(['Date', 'ticker']).sum()
total_mentions.reset_index(inplace=True)

In [None]:
total_mentions.loc[total_mentions['ticker']=='AMC']

In [None]:
#drop unneeded columns
# consolidated_mentions.drop(columns=['timestamp', 'weekday','days_to_add'], inplace=True)

total_mentions.head()

In [None]:
#export mention count from Wallstreet Bets to csv
total_mentions.to_csv("Stock_Mention_Count.csv", index = False)

# Extract stock ticker close prices from Yahoo Finance

In [None]:
#download stock ticker data from Jan 1-Jun 1 (same time period as Wallstreet Bets)
data = yf.download('GME AMC CLOV BB CLF CLNE WKHS SENS PLTR RKT WWE FORD', start="2021-01-28", end="2021-06-01")
data.head()

In [None]:
#drop columns not needed
data.drop(columns=["Close", "High", "Low", "Open", "Volume"], inplace=True)
data.head()

In [None]:
data.columns = data.columns.droplevel(0)
data.reset_index(inplace=True)
data

In [None]:
#use natural log from numpy to get stock price changes
data['AMC_pct_chg']=np.log(data['AMC']/data['AMC'].shift(1))
data['BB_pct_chg']=np.log(data['BB']/data['BB'].shift(1))
data['CLF_pct_chg']=np.log(data['CLF']/data['CLF'].shift(1))
data['CLNE_pct_chg']=np.log(data['CLNE']/data['CLNE'].shift(1))
data['CLOV_pct_chg']=np.log(data['CLOV']/data['CLOV'].shift(1))
data['FORD_pct_chg']=np.log(data['FORD']/data['FORD'].shift(1))
data['GME_pct_chg']=np.log(data['GME']/data['GME'].shift(1))
data['PLTR_pct_chg']=np.log(data['PLTR']/data['PLTR'].shift(1))
data['RKT_pct_chg']=np.log(data['RKT']/data['RKT'].shift(1))
data['SENS_pct_chg']=np.log(data['SENS']/data['SENS'].shift(1))
data['WKHS_pct_chg']=np.log(data['WKHS']/data['WKHS'].shift(1))
data['WWE_pct_chg']=np.log(data['WWE']/data['WWE'].shift(1))

In [None]:
#drop unneeded columns with original adjusted close price
yf_columns = ['AMC', 'BB', 'CLF', 'CLNE', 'CLOV', 'FORD', 'GME', 'PLTR', 'RKT', 'SENS', 'WKHS', 'WWE']
data.drop(columns=yf_columns, inplace=True)

In [None]:
#unpivot columns using Pandas melt to consolidate all mentions into one column
columns = ['AMC_pct_chg', 'BB_pct_chg', 'CLF_pct_chg', 'CLNE_pct_chg', 'CLOV_pct_chg', 'FORD_pct_chg', 'GME_pct_chg',
           'PLTR_pct_chg', 'RKT_pct_chg', 'SENS_pct_chg', 'WKHS_pct_chg', 'WWE_pct_chg']
consolidated_yf_data = pd.melt(data, id_vars=['Date'], value_vars = columns,
                               var_name='ticker', value_name='percent_change')

#first day of each ticker data is n/a, replace with zeroes
consolidated_yf_data['percent_change'] = consolidated_yf_data['percent_change'].fillna(0)

#trim ticker names
consolidated_yf_data['ticker']=consolidated_yf_data['ticker'].str.replace('_pct_chg', '')

consolidated_yf_data

In [None]:
#export Yahoo Finance stock price changes to csv
consolidated_yf_data.to_csv("Stock_Price_Changes.csv", index = False)

In [None]:
merged_df = pd.merge(total_mentions, consolidated_yf_data, how="outer", on=["Date", "ticker"])
merged_df = merged_df[["Date", "ticker", "mention_count", "percent_change"]]
merged_df.head()

In [None]:
#export merged df to csv
merged_df.to_csv("Callstreet_Bets.csv", index = False)