# Extract Stock Ticker Mentions from Wallstreet Bets Data

In [1]:
import csv
import pandas as pd
import yfinance as yf

In [2]:
# import Wallstreet Bets csv file
csvfile = "./Resources/reddit_wsb.csv"
wsb_csv = pd.read_csv(csvfile)
wsb_csv.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,timestamp
0,"It's not about the money, it's about sending a...",55,l6ulcx,https://v.redd.it/6j75regs72e61,6,1611863000.0,,2021-01-28 21:37:41
1,Math Professor Scott Steiner says the numbers ...,110,l6uibd,https://v.redd.it/ah50lyny62e61,23,1611862000.0,,2021-01-28 21:32:10
2,Exit the system,0,l6uhhn,https://www.reddit.com/r/wallstreetbets/commen...,47,1611862000.0,The CEO of NASDAQ pushed to halt trading “to g...,2021-01-28 21:30:35
3,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,29,l6ugk6,https://sec.report/Document/0001193125-21-019848/,74,1611862000.0,,2021-01-28 21:28:57
4,"Not to distract from GME, just thought our AMC...",71,l6ufgy,https://i.redd.it/4h2sukb662e61.jpg,156,1611862000.0,,2021-01-28 21:26:56


In [3]:
#replace body n/a with empty string
wsb_csv['body'] = wsb_csv['body'].fillna("")

#merge title and body together for comprehensive field search
wsb_csv['title_body'] = wsb_csv['title'] + wsb_csv['body']

#drop unneeded columns
wsb_csv.drop(columns=['score', 'id','url','comms_num','created', 'title', 'body'], inplace=True)

#convert search field to uppercase to allow desensitize case for searching
wsb_csv['title_body'] = wsb_csv['title_body'].str.upper()

#drop time from date timestamp
wsb_csv['timestamp'] = pd.to_datetime(wsb_csv['timestamp']).dt.date

#add weekday to df, need to change timestamp to mirror with stock ticker data
# wsb_csv['weekday'] = pd.to_datetime(wsb_csv['timestamp']).dt.day_name()

In [4]:
#create dictionary with term list for each ticker
ticker_list = {'AMC': ['AMC'], 'GME': ['GME'], 'CLOV': ['CLOV'], 'BB': ['BB'], 'CLF': ['CLF'], 'CLNE': ['CLNE'],
               'WKHS': ['WKHS'], 'SENS': ['SENS'], 'PLTR': ['PLTR'], 'RKT': ['RKT'], 'WWE': ['WWE'], 'FORD': ['FORD']}

#loop through dataframe to identify number of mentions for each ticker
for ticker, stock_names in ticker_list.items():
    for name in stock_names:
        wsb_csv[ticker] = wsb_csv['title_body'].str.contains(name).astype(int)

wsb_csv.head()

Unnamed: 0,timestamp,title_body,AMC,GME,CLOV,BB,CLF,CLNE,WKHS,SENS,PLTR,RKT,WWE,FORD
0,2021-01-28,"IT'S NOT ABOUT THE MONEY, IT'S ABOUT SENDING A...",0,0,0,0,0,0,0,0,0,0,0,0
1,2021-01-28,MATH PROFESSOR SCOTT STEINER SAYS THE NUMBERS ...,0,0,0,0,0,0,0,0,0,0,0,0
2,2021-01-28,EXIT THE SYSTEMTHE CEO OF NASDAQ PUSHED TO HAL...,0,1,0,0,0,0,0,0,0,0,0,0
3,2021-01-28,NEW SEC FILING FOR GME! CAN SOMEONE LESS RETAR...,0,1,0,0,0,0,0,0,0,0,0,0
4,2021-01-28,"NOT TO DISTRACT FROM GME, JUST THOUGHT OUR AMC...",1,1,0,0,0,0,0,0,0,0,0,0


In [5]:
#group number of mentions by the date mentioned, move date from index to column
mentions_by_ticker_df = wsb_csv.groupby(["timestamp"]).sum()
mentions_by_ticker_df.reset_index(inplace=True)
mentions_by_ticker_df

Unnamed: 0,timestamp,AMC,GME,CLOV,BB,CLF,CLNE,WKHS,SENS,PLTR,RKT,WWE,FORD
0,2020-09-29,0,0,0,0,0,0,0,0,0,0,0,0
1,2021-01-28,214,419,1,123,0,0,0,11,7,0,1,7
2,2021-01-29,2276,4677,0,1488,0,0,0,87,41,4,3,97
3,2021-01-30,151,528,0,117,0,0,1,23,8,2,1,30
4,2021-01-31,98,410,0,84,1,0,0,25,7,0,0,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...
118,2021-05-28,38,13,0,19,0,0,0,7,4,4,0,9
119,2021-05-29,14,2,2,6,0,1,0,2,0,1,0,1
120,2021-05-30,10,8,0,7,0,0,0,6,1,0,0,2
121,2021-05-31,3,5,0,8,0,0,0,2,0,0,0,5


In [6]:
#unpivot columns using Pandas melt to consolidate all mentions into one column
columns = ['AMC', 'GME', 'CLOV', 'BB', 'CLF', 'CLNE', 'WKHS', 'SENS', 'PLTR', 'RKT', 'WWE', 'FORD']
consolidated_mentions = pd.melt(mentions_by_ticker_df, id_vars=['timestamp'], value_vars = columns,
                               var_name='ticker', value_name='mention_count')
consolidated_mentions

Unnamed: 0,timestamp,ticker,mention_count
0,2020-09-29,AMC,0
1,2021-01-28,AMC,214
2,2021-01-29,AMC,2276
3,2021-01-30,AMC,151
4,2021-01-31,AMC,98
...,...,...,...
1471,2021-05-28,FORD,9
1472,2021-05-29,FORD,1
1473,2021-05-30,FORD,2
1474,2021-05-31,FORD,5


In [7]:
# delete all rows mention count equal to zero
indexNames = consolidated_mentions[consolidated_mentions['mention_count'] ==0].index
consolidated_mentions.drop(indexNames, inplace=True)

#reset the index after dropping zeroes
consolidated_mentions.reset_index(drop=True)

Unnamed: 0,timestamp,ticker,mention_count
0,2021-01-28,AMC,214
1,2021-01-29,AMC,2276
2,2021-01-30,AMC,151
3,2021-01-31,AMC,98
4,2021-02-01,AMC,89
...,...,...,...
912,2021-05-26,FORD,2
913,2021-05-28,FORD,9
914,2021-05-29,FORD,1
915,2021-05-30,FORD,2


# Extract stock ticker close prices from Yahoo Finance

In [14]:
#download stock ticker data from Jan 1-Jun 1 (same time period as Wallstreet Bets)
data = yf.download('GME AMC CLOV BB CLF CLNE WKHS SENS PLTR RKT WWE FORD', start="2021-01-01", end="2021-06-01")
data.head()

[*********************100%***********************]  12 of 12 completed


Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,AMC,BB,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,...,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,WKHS,WWE
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2021-01-04,2.01,6.58,14.42,7.83,15.61,1.88,17.25,23.370001,19.115538,0.93,...,13943600,6023300,21001700,117800,10022500,44970400,6486900,9844900,19044700,744800
2021-01-05,1.98,6.77,15.7,8.68,15.09,1.89,17.370001,24.6,19.010401,0.95,...,17867500,13054700,13198800,123100,4961500,29050400,14297700,31283500,13573300,549800
2021-01-06,2.01,6.71,17.42,9.0,14.53,1.85,18.360001,23.540001,18.494284,0.898,...,27988700,13370400,26843600,322000,6056200,32732900,16289300,9545400,18997800,740700
2021-01-07,2.05,7.06,17.67,11.1,16.02,1.91,18.08,25.0,18.847919,0.88,...,18574800,21143600,20563800,95200,6129300,32240000,8838300,8125700,24390400,573900
2021-01-08,2.14,7.56,18.040001,10.25,15.9,1.92,17.690001,25.200001,19.067749,0.867,...,15849300,17507800,15380000,84200,6482000,41313800,6596800,4864000,21290600,574200


In [15]:
#drop columns not needed
data.drop(columns=["Close", "High", "Low", "Open", "Volume"], inplace=True)
data.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close
Unnamed: 0_level_1,AMC,BB,CLF,CLNE,CLOV,FORD,GME,PLTR,RKT,SENS,WKHS,WWE
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
2021-01-04,2.01,6.58,14.42,7.83,15.61,1.88,17.25,23.370001,19.115538,0.93,21.42,47.122772
2021-01-05,1.98,6.77,15.7,8.68,15.09,1.89,17.370001,24.6,19.010401,0.95,22.43,47.921124
2021-01-06,2.01,6.71,17.42,9.0,14.53,1.85,18.360001,23.540001,18.494284,0.898,23.65,48.400135
2021-01-07,2.05,7.06,17.67,11.1,16.02,1.91,18.08,25.0,18.847919,0.88,27.6,48.33028
2021-01-08,2.14,7.56,18.040001,10.25,15.9,1.92,17.690001,25.200001,19.067749,0.867,25.565001,48.140675


In [16]:
data.columns = data.columns.droplevel(0)
data.reset_index(inplace=True)
data

In [None]:
GME_price_history = data.iloc[:, data.columns.get_level_values(1)=='GME']
AMC_price_history = data.iloc[:, data.columns.get_level_values(1)=='AMC']
BB_price_history = data.iloc[:, data.columns.get_level_values(1)=='BB']
CLF_price_history = data.iloc[:, data.columns.get_level_values(1)=='CLF']
CLNE_price_history = data.iloc[:, data.columns.get_level_values(1)=='CLNE']
FORD_price_history = data.iloc[:, data.columns.get_level_values(1)=='FORD']
PLTR_price_history = data.iloc[:, data.columns.get_level_values(1)=='PLTR']
RKT_price_history = data.iloc[:, data.columns.get_level_values(1)=='RKT']
SENS_price_history = data.iloc[:, data.columns.get_level_values(1)=='SENS']
WKHS_price_history = data.iloc[:, data.columns.get_level_values(1)=='WKHS']
WWE_price_history = data.iloc[:, data.columns.get_level_values(1)=='WWE']

In [None]:
GME_price_history.columns = GME_price_history.columns.droplevel(1)
AMC_price_history.columns = AMC_price_history.columns.droplevel(1)
BB_price_history.columns = BB_price_history.columns.droplevel(1)
CLF_price_history.columns = CLF_price_history.columns.droplevel(1)
CLNE_price_history.columns = CLNE_price_history.columns.droplevel(1)
FORD_price_history.columns = FORD_price_history.columns.droplevel(1)
PLTR_price_history.columns = PLTR_price_history.columns.droplevel(1)
RKT_price_history.columns = RKT_price_history.columns.droplevel(1)
SENS_price_history.columns = SENS_price_history.columns.droplevel(1)
WKHS_price_history.columns = WKHS_price_history.columns.droplevel(1)
WWE_price_history.columns = WWE_price_history.columns.droplevel(1)

In [None]:
GME_price_history['Percent change'] = None
AMC_price_history['Percent change'] = None
BB_price_history['Percent change'] = None
CLF_price_history['Percent change'] = None
CLNE_price_history['Percent change'] = None
FORD_price_history['Percent change'] = None
PLTR_price_history['Percent change'] = None
RKT_price_history['Percent change'] = None
SENS_price_history['Percent change'] = None
WKHS_price_history['Percent change'] = None
WWE_price_history['Percent change'] = None

GME_price_history['Percent change'] = GME_price_history.pct_change()
AMC_price_history['Percent change'] = AMC_price_history.pct_change()
BB_price_history['Percent change'] = BB_price_history.pct_change()
CLF_price_history['Percent change'] = CLF_price_history.pct_change()
CLNE_price_history['Percent change'] = CLNE_price_history.pct_change()
FORD_price_history['Percent change'] = FORD_price_history.pct_change()
PLTR_price_history['Percent change'] = PLTR_price_history.pct_change()
RKT_price_history['Percent change'] = RKT_price_history.pct_change()
SENS_price_history['Percent change'] = SENS_price_history.pct_change()
WKHS_price_history['Percent change'] = WKHS_price_history.pct_change()
WWE_price_history['Percent change'] = WWE_price_history.pct_change()

In [None]:
AMC_price_history.head()

In [None]:
GME_price_history['Ticker'] = None
AMC_price_history['Ticker'] = None
BB_price_history["Ticker"] = None
CLF_price_history["Ticker"] = None
CLNE_price_history["Ticker"] = None
FORD_price_history["Ticker"] = None
PLTR_price_history["Ticker"] = None
RKT_price_history["Ticker"] = None
SENS_price_history["Ticker"] = None
WKHS_price_history["Ticker"] = None
WWE_price_history["Ticker"] = None

GME_price_history.loc[:,'Ticker'] = 'GME'
AMC_price_history.loc[:,'Ticker'] = 'AMC'
BB_price_history.loc[:,"Ticker"]= 'BB'
CLF_price_history.loc[:,"Ticker"]= 'CLF'
CLNE_price_history.loc[:,"Ticker"]= 'CLNE'
FORD_price_history.loc[:,"Ticker"]= 'FORD'
PLTR_price_history.loc[:,"Ticker"]= 'PLTR'
RKT_price_history.loc[:,"Ticker"]= 'RKT'
SENS_price_history.loc[:,"Ticker"]= 'SENS'
WKHS_price_history.loc[:,"Ticker"]= 'WKHS'
WWE_price_history.loc[:,"Ticker"] = 'WWE'

In [None]:
GME_price_history.reset_index(inplace=True)
AMC_price_history.reset_index(inplace=True)
BB_price_history.reset_index(inplace=True)
CLF_price_history.reset_index(inplace=True)
CLNE_price_history.reset_index(inplace=True)
FORD_price_history.reset_index(inplace=True)
PLTR_price_history.reset_index(inplace=True)
RKT_price_history.reset_index(inplace=True)
SENS_price_history.reset_index(inplace=True)
WKHS_price_history.reset_index(inplace=True)
WWE_price_history.reset_index(inplace=True)

In [None]:
AMC_price_history.head()

In [None]:
stock_price_changes = GME_price_history.append(AMC_price_history)
stock_price_changes = stock_price_changes.append(BB_price_history)
stock_price_changes = stock_price_changes.append(CLF_price_history)
stock_price_changes = stock_price_changes.append(CLNE_price_history)
stock_price_changes = stock_price_changes.append(FORD_price_history)
stock_price_changes = stock_price_changes.append(PLTR_price_history)
stock_price_changes = stock_price_changes.append(RKT_price_history)
stock_price_changes = stock_price_changes.append(SENS_price_history)
stock_price_changes = stock_price_changes.append(WKHS_price_history)
stock_price_changes = stock_price_changes.append(WWE_price_history)
stock_price_changes

In [None]:
stock_price_changes['Percent change'] = stock_price_changes['Percent change'].fillna(0)
stock_price_changes

In [None]:
stock_price_changes.to_csv("Stock_Price_Changes.csv")