# Extract Stock Ticker Mentions from Wallstreet Bets Data

In [1]:
#import dependencies
import csv
import pandas as pd
import yfinance as yf
import numpy as np
import datetime as dt

In [2]:
# import Wallstreet Bets csv file from Kaggle
# url= https://www.kaggle.com/gpreda/reddit-wallstreetsbets-posts
csvfile = "../Resources/reddit_wsb.csv"
wsb_csv = pd.read_csv(csvfile)
wsb_csv = pd.read_csv(csvfile, parse_dates=['timestamp'], infer_datetime_format=True)
wsb_csv.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [3]:
wsb_csv.dtypes

title                object
score                 int64
id                   object
url                  object
comms_num             int64
created             float64
body                 object
timestamp    datetime64[ns]
dtype: object

In [4]:
#replace body n/a with empty string
wsb_csv['body'] = wsb_csv['body'].fillna("")

#merge title and body together for comprehensive field search
wsb_csv['title_body'] = wsb_csv['title'] + wsb_csv['body']

#drop unneeded columns
wsb_csv.drop(columns=['score', 'id','url','comms_num','created', 'title', 'body'], inplace=True)

#convert search field to uppercase to allow desensitize case for searching
wsb_csv['title_body'] = wsb_csv['title_body'].str.upper()

#drop time from date timestamp
wsb_csv['timestamp'] = pd.to_datetime(wsb_csv['timestamp']).dt.date

#add weekday to df, need to change timestamp to mirror with stock ticker data
wsb_csv['weekday'] = pd.to_datetime(wsb_csv['timestamp']).dt.day_name()

wsb_csv.head()

Unnamed: 0,timestamp,title_body,weekday
0,2021-01-28,"IT'S NOT ABOUT THE MONEY, IT'S ABOUT SENDING A...",Thursday
1,2021-01-28,MATH PROFESSOR SCOTT STEINER SAYS THE NUMBERS ...,Thursday
2,2021-01-28,EXIT THE SYSTEMTHE CEO OF NASDAQ PUSHED TO HAL...,Thursday
3,2021-01-28,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,Thursday
4,2021-01-28,"NOT TO DISTRACT FROM GME, JUST THOUGHT OUR AMC...",Thursday


In [5]:
#create dictionary with term list for each ticker
ticker_list = {'AMC': ['AMC'], 'GME': ['GME'], 'CLOV': ['CLOV'], 'BB': ['BB'], 'CLF': ['CLF'], 'CLNE': ['CLNE'],
               'WKHS': ['WKHS'], 'SENS': ['SENS'], 'PLTR': ['PLTR'], 'RKT': ['RKT'], 'WWE': ['WWE'], 'FORD': ['FORD']}

#loop through dataframe to identify number of mentions for each ticker
for ticker, stock_names in ticker_list.items():
    for name in stock_names:
        wsb_csv[ticker] = wsb_csv['title_body'].str.contains(name).astype(int)

wsb_csv.head()

Unnamed: 0,timestamp,title_body,weekday,AMC,GME,CLOV,BB,CLF,CLNE,WKHS,SENS,PLTR,RKT,WWE,FORD
0,2021-01-28,"IT'S NOT ABOUT THE MONEY, IT'S ABOUT SENDING A...",Thursday,0,0,0,0,0,0,0,0,0,0,0,0
1,2021-01-28,MATH PROFESSOR SCOTT STEINER SAYS THE NUMBERS ...,Thursday,0,0,0,0,0,0,0,0,0,0,0,0
2,2021-01-28,EXIT THE SYSTEMTHE CEO OF NASDAQ PUSHED TO HAL...,Thursday,0,1,0,0,0,0,0,0,0,0,0,0
3,2021-01-28,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,Thursday,0,1,0,0,0,0,0,0,0,0,0,0
4,2021-01-28,"NOT TO DISTRACT FROM GME, JUST THOUGHT OUR AMC...",Thursday,1,1,0,0,0,0,0,0,0,0,0,0


In [6]:
#group number of mentions by the date mentioned, move date from index to column
mentions_by_ticker_df = wsb_csv.groupby(["timestamp"]).sum()
mentions_by_ticker_df.reset_index(inplace=True)
mentions_by_ticker_df['timestamp'] = mentions_by_ticker_df['timestamp'].astype('datetime64[ns]')
mentions_by_ticker_df

Unnamed: 0,timestamp,AMC,GME,CLOV,BB,CLF,CLNE,WKHS,SENS,PLTR,RKT,WWE,FORD
0,2020-09-29,0,0,0,0,0,0,0,0,0,0,0,0
1,2021-01-28,214,419,1,123,0,0,0,11,7,0,1,7
2,2021-01-29,2276,4677,0,1488,0,0,0,87,41,4,3,97
3,2021-01-30,151,528,0,117,0,0,1,23,8,2,1,30
4,2021-01-31,98,410,0,84,1,0,0,25,7,0,0,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,2021-05-28,38,13,0,19,0,0,0,7,4,4,0,9
119,2021-05-29,14,2,2,6,0,1,0,2,0,1,0,1
120,2021-05-30,10,8,0,7,0,0,0,6,1,0,0,2
121,2021-05-31,3,5,0,8,0,0,0,2,0,0,0,5


In [7]:
#chose list of popular stocks from url = https://memestocks.org

#unpivot columns using Pandas melt to consolidate all mentions into one column
columns = ['AMC', 'GME', 'CLOV', 'BB', 'CLF', 'CLNE', 'WKHS', 'SENS', 'PLTR', 'RKT', 'WWE', 'FORD']
consolidated_mentions = pd.melt(mentions_by_ticker_df, id_vars=['timestamp'], value_vars = columns,
                               var_name='ticker', value_name='mention_count')
consolidated_mentions

Unnamed: 0,timestamp,ticker,mention_count
0,2020-09-29,AMC,0
1,2021-01-28,AMC,214
2,2021-01-29,AMC,2276
3,2021-01-30,AMC,151
4,2021-01-31,AMC,98
...,...,...,...
1471,2021-05-28,FORD,9
1472,2021-05-29,FORD,1
1473,2021-05-30,FORD,2
1474,2021-05-31,FORD,5


In [8]:
# delete random September date
indexNames = consolidated_mentions[consolidated_mentions['timestamp']=='2020-09-29'].index
consolidated_mentions.drop(indexNames, inplace=True)
consolidated_mentions.reset_index(drop=True, inplace=True)
consolidated_mentions.head()

Unnamed: 0,timestamp,ticker,mention_count
0,2021-01-28,AMC,214
1,2021-01-29,AMC,2276
2,2021-01-30,AMC,151
3,2021-01-31,AMC,98
4,2021-02-01,AMC,89


In [9]:
# # delete all rows mention count equal to zero
# indexNames = consolidated_mentions[consolidated_mentions['mention_count'] ==0].index
# consolidated_mentions.drop(indexNames, inplace=True)

# #reset the index after dropping zeroes
# consolidated_mentions.reset_index(drop=True)

# consolidated_mentions.head()

In [10]:
#add weekday to df, need to change timestamp to mirror with stock ticker data
consolidated_mentions['weekday'] = pd.to_datetime(consolidated_mentions['timestamp']).dt.day_name()

#create a new column to identify weekends and how many days to add
consolidated_mentions["days_to_add"] = 0

#add 2 days on Saturday to match to Monday's stock data
consolidated_mentions.loc[consolidated_mentions.weekday=="Saturday", 'days_to_add'] = 2

#add 1 day to Sunday to match to Monday's stock data
consolidated_mentions.loc[consolidated_mentions.weekday=="Sunday", 'days_to_add'] = 1

#add days to holidays stock data
consolidated_mentions.loc[consolidated_mentions.timestamp=='2021-02-15', 'days_to_add'] = 1
consolidated_mentions.loc[consolidated_mentions.timestamp=='2021-04-02', 'days_to_add'] = 3
consolidated_mentions.loc[consolidated_mentions.timestamp=='2021-05-31', 'days_to_add'] = 1

#adjust mention date based on days_to_add, this is to line up with the Yahoo Finance data later
temp = consolidated_mentions['days_to_add'].apply(np.ceil).apply(lambda x: pd.Timedelta(x, unit='D'))
consolidated_mentions['Date'] = consolidated_mentions['timestamp'] + temp

#review changes to ensure accuracy
consolidated_mentions.groupby('weekday').max('days_to_add')

Unnamed: 0_level_0,mention_count,days_to_add
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1
Friday,4677,3
Monday,342,1
Saturday,939,2
Sunday,410,1
Thursday,544,0
Tuesday,450,0
Wednesday,738,0


In [11]:
total_mentions = consolidated_mentions.groupby(['Date', 'ticker']).sum()
total_mentions.reset_index(inplace=True)

In [12]:
total_mentions.loc[total_mentions['ticker']=='AMC']

Unnamed: 0,Date,ticker,mention_count,days_to_add
0,2021-01-28,AMC,214,0
12,2021-01-29,AMC,2276,0
24,2021-02-01,AMC,338,3
36,2021-02-02,AMC,125,0
48,2021-02-03,AMC,244,0
...,...,...,...,...
984,2021-05-26,AMC,10,0
996,2021-05-27,AMC,15,0
1008,2021-05-28,AMC,38,0
1020,2021-05-31,AMC,24,3


In [13]:
#drop unneeded columns
total_mentions.drop(columns=['days_to_add'], inplace=True)
total_mentions.head()

Unnamed: 0,Date,ticker,mention_count
0,2021-01-28,AMC,214
1,2021-01-28,BB,123
2,2021-01-28,CLF,0
3,2021-01-28,CLNE,0
4,2021-01-28,CLOV,1


In [14]:
#export mention count from Wallstreet Bets to csv
total_mentions.to_csv("../Resources/Stock_Mention_Count.csv", index = False)

# Extract stock ticker close prices from Yahoo Finance

In [15]:
#download stock ticker data from Jan 1-Jun 1 (same time period as Wallstreet Bets)
data = yf.download('GME AMC CLOV BB CLF CLNE WKHS SENS PLTR RKT WWE FORD', start="2021-01-28", end="2021-06-01")
data.head()

[*********************100%***********************]  12 of 12 completed


Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AMC,BB,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,...,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,WKHS,WWE
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-28,8.63,14.65,16.030001,10.44,14.07,3.02,193.600006,35.66,20.367605,2.44,...,11008800,8168900,17912600,446500,58815800,85250700,13670400,40266900,20809500,912600
2021-01-29,13.26,14.1,15.34,10.23,13.95,3.15,325.0,35.18,20.415394,2.37,...,15367600,5856000,12310600,286600,50566100,42030900,21610100,37910100,23194000,1573800
2021-02-01,13.3,14.63,15.68,10.57,13.81,3.91,225.0,33.959999,20.587435,2.69,...,7471500,4525700,10178500,1647600,37382200,43460900,9663600,51758500,23995700,775900
2021-02-02,7.82,11.55,15.18,10.47,13.97,3.44,90.0,31.02,20.013968,2.74,...,8658200,4281900,11088900,837500,78183100,69647900,10117100,37810100,20022600,947300
2021-02-03,8.97,12.0,15.17,12.83,13.95,3.6,92.410004,31.76,20.272026,2.73,...,8246900,17966300,8780100,379600,42698500,35312000,12375000,26361200,11808800,1718100


In [16]:
#drop columns not needed
data.drop(columns=["Close", "High", "Low", "Open", "Volume"], inplace=True)
data.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close
Unnamed: 0_level_1,AMC,BB,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,WKHS,WWE
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2021-01-28,8.63,14.65,16.030001,10.44,14.07,3.02,193.600006,35.66,20.367605,2.44,33.099998,56.373688
2021-01-29,13.26,14.1,15.34,10.23,13.95,3.15,325.0,35.18,20.415394,2.37,34.32,56.214016
2021-02-01,13.3,14.63,15.68,10.57,13.81,3.91,225.0,33.959999,20.587435,2.69,38.860001,57.461441
2021-02-02,7.82,11.55,15.18,10.47,13.97,3.44,90.0,31.02,20.013968,2.74,34.209999,57.042305
2021-02-03,8.97,12.0,15.17,12.83,13.95,3.6,92.410004,31.76,20.272026,2.73,36.049999,56.393642


In [17]:
data.columns = data.columns.droplevel(0)
data.reset_index(inplace=True)
data

Unnamed: 0,Date,AMC,BB,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,WKHS,WWE
0,2021-01-28,8.630000,14.65,16.030001,10.44,14.07,3.02,193.600006,35.660000,20.367605,2.44,33.099998,56.373688
1,2021-01-29,13.260000,14.10,15.340000,10.23,13.95,3.15,325.000000,35.180000,20.415394,2.37,34.320000,56.214016
2,2021-02-01,13.300000,14.63,15.680000,10.57,13.81,3.91,225.000000,33.959999,20.587435,2.69,38.860001,57.461441
3,2021-02-02,7.820000,11.55,15.180000,10.47,13.97,3.44,90.000000,31.020000,20.013968,2.74,34.209999,57.042305
4,2021-02-03,8.970000,12.00,15.170000,12.83,13.95,3.60,92.410004,31.760000,20.272026,2.73,36.049999,56.393642
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80,2021-05-24,13.680000,8.62,18.559999,7.96,6.92,2.53,180.009995,21.219999,16.900000,1.99,8.310000,57.919998
81,2021-05-25,16.410000,8.59,18.200001,7.63,7.02,2.40,209.429993,21.480000,16.809999,1.86,8.140000,57.279999
82,2021-05-26,19.559999,9.44,18.920000,8.04,7.33,2.44,242.559998,22.120001,17.459999,1.97,9.130000,56.880001
83,2021-05-27,26.520000,9.97,20.200001,8.09,7.83,2.83,254.130005,22.700001,18.350000,2.07,9.620000,56.669998


In [18]:
#use natural log from numpy to get stock price changes
data['AMC_pct_chg']=np.log(data['AMC']/data['AMC'].shift(1))
data['BB_pct_chg']=np.log(data['BB']/data['BB'].shift(1))
data['CLF_pct_chg']=np.log(data['CLF']/data['CLF'].shift(1))
data['CLNE_pct_chg']=np.log(data['CLNE']/data['CLNE'].shift(1))
data['CLOV_pct_chg']=np.log(data['CLOV']/data['CLOV'].shift(1))
data['FORD_pct_chg']=np.log(data['FORD']/data['FORD'].shift(1))
data['GME_pct_chg']=np.log(data['GME']/data['GME'].shift(1))
data['PLTR_pct_chg']=np.log(data['PLTR']/data['PLTR'].shift(1))
data['RKT_pct_chg']=np.log(data['RKT']/data['RKT'].shift(1))
data['SENS_pct_chg']=np.log(data['SENS']/data['SENS'].shift(1))
data['WKHS_pct_chg']=np.log(data['WKHS']/data['WKHS'].shift(1))
data['WWE_pct_chg']=np.log(data['WWE']/data['WWE'].shift(1))

In [19]:
#drop unneeded columns with original adjusted close price
yf_columns = ['AMC', 'BB', 'CLF', 'CLNE', 'CLOV', 'FORD', 'GME', 'PLTR', 'RKT', 'SENS', 'WKHS', 'WWE']
data.drop(columns=yf_columns, inplace=True)

In [20]:
#unpivot columns using Pandas melt to consolidate all mentions into one column
columns = ['AMC_pct_chg', 'BB_pct_chg', 'CLF_pct_chg', 'CLNE_pct_chg', 'CLOV_pct_chg', 'FORD_pct_chg', 'GME_pct_chg',
           'PLTR_pct_chg', 'RKT_pct_chg', 'SENS_pct_chg', 'WKHS_pct_chg', 'WWE_pct_chg']
consolidated_yf_data = pd.melt(data, id_vars=['Date'], value_vars = columns,
                               var_name='ticker', value_name='percent_change')

#first day of each ticker data is n/a, replace with zeroes
consolidated_yf_data['percent_change'] = consolidated_yf_data['percent_change'].fillna(0)

#trim ticker names
consolidated_yf_data['ticker']=consolidated_yf_data['ticker'].str.replace('_pct_chg', '')

consolidated_yf_data

Unnamed: 0,Date,ticker,percent_change
0,2021-01-28,AMC,0.000000
1,2021-01-29,AMC,0.429507
2,2021-02-01,AMC,0.003012
3,2021-02-02,AMC,-0.531079
4,2021-02-03,AMC,0.137201
...,...,...,...
1015,2021-05-24,WWE,0.006409
1016,2021-05-25,WWE,-0.011111
1017,2021-05-26,WWE,-0.007008
1018,2021-05-27,WWE,-0.003699


In [21]:
#export Yahoo Finance stock price changes to csv
consolidated_yf_data.to_csv("../Resources/Stock_Price_Changes.csv", index = False)

In [22]:
merged_df = pd.merge(total_mentions, consolidated_yf_data, how="outer", on=["Date", "ticker"])
merged_df = merged_df[["Date", "ticker", "mention_count", "percent_change"]]
merged_df.head()

Unnamed: 0,Date,ticker,mention_count,percent_change
0,2021-01-28,AMC,214.0,0.0
1,2021-01-28,BB,123.0,0.0
2,2021-01-28,CLF,0.0,0.0
3,2021-01-28,CLNE,0.0,0.0
4,2021-01-28,CLOV,1.0,0.0


In [23]:
#export merged df to csv
merged_df.to_csv("../Resources/Callstreet_Bets.csv", index = False)