# Stock Sentiment Analysis on Reddit Data with NER

In [8]:
import spacy
from spacy import displacy
import pandas as pd
from collections import Counter
import flair

In [4]:
nlp = spacy.load('en_core_web_trf')

In [5]:
BLACKLIST = ['ev', 'covid', 'etf', 'nyse', 'sec', 'spac', 'fda',
             'fed', 'treasury', 'eu', 'cnbc', 'faq', 'company']

def get_orgs(text):
    # process the text with our SpaCy model to get named entities
    doc = nlp(text)
    # initialize list to store identified organizations
    org_list = []
    for entity in doc.ents:
        # here we modify the original code to check that entity text is not equal to one of our 'blacklisted' organizations
        # (we also add .lower() to lowercase the text, this allows us to match both 'nyse' and 'NYSE' with just 'nyse')
        if entity.label_ == 'ORG' and entity.text.lower() not in BLACKLIST:
            org_list.append(entity.text)
    # if organization is identified more than once it will appear multiple times in list
    # we use set() to remove duplicates then convert back to list
    org_list = list(set(org_list))
    return org_list

In [6]:
df = pd.read_csv('./data/reddit_investing.csv', sep='|')
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score
0,t3_qwth1y,1637253000.0,investing,"Rather than comparing P/E to History, Compare ...",S&amp;P 500 forward price to earnings ratios a...,1.0,3.0,0.0,3.0
1,t3_qwt3jd,1637252000.0,investing,Sono Group: An EV startup that managed to cont...,"Sono Group ($SEV), an EV startup that went pub...",0.83,4.0,0.0,4.0
2,t3_qwrgj1,1637247000.0,investing,Covered calls tips and questions?,Just a few questions that I'm still confused a...,1.0,1.0,0.0,1.0
3,t3_qwr6np,1637246000.0,investing,Turkey defies warnings and cuts interest rates,The central bank cut its one-week repo rate by...,0.93,32.0,0.0,32.0
4,t3_qwqm7x,1637245000.0,investing,To what extent can cryptoassets be an ally of ...,Could the consumer price index be higher today...,0.36,0.0,0.0,0.0


In [None]:
df['organizations'] = df['selftext'].apply(get_orgs)
df.head()

In [None]:
# merge organizations column into one big list
orgs = df['organizations'].to_list()
orgs = [org for sublist in orgs for org in sublist]
orgs[:10]

In [None]:
from collections import Counter

In [None]:
# create dictionary of organization mention frequency
org_freq = Counter(orgs)

In [None]:
org_freq.most_common(10)

In [11]:
df.to_csv('./data/reddit_investing_ner.csv', sep='|', index=False)

In [12]:
model = flair.models.TextClassifier.load('en-sentiment')

2021-11-20 14:55:40,749 loading file C:\Users\myusu\.flair\models\sentiment-en-mix-distillbert_4.pt


In [9]:
# load data
df = pd.read_csv('./data/reddit_investing_ner.csv', sep='|')
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations
0,t3_qwth1y,1637253000.0,investing,"Rather than comparing P/E to History, Compare ...",S&amp;P 500 forward price to earnings ratios a...,1.0,3.0,0.0,3.0,"['Tesla', 'Fidelity Large Cap Value', 'Roblox'..."
1,t3_qwt3jd,1637252000.0,investing,Sono Group: An EV startup that managed to cont...,"Sono Group ($SEV), an EV startup that went pub...",0.83,4.0,0.0,4.0,"['Sono Group', 'SAAB', 'NEVS', 'Evergrande Hea..."
2,t3_qwrgj1,1637247000.0,investing,Covered calls tips and questions?,Just a few questions that I'm still confused a...,1.0,1.0,0.0,1.0,[]
3,t3_qwr6np,1637246000.0,investing,Turkey defies warnings and cuts interest rates,The central bank cut its one-week repo rate by...,0.93,32.0,0.0,32.0,[]
4,t3_qwqm7x,1637245000.0,investing,To what extent can cryptoassets be an ally of ...,Could the consumer price index be higher today...,0.36,0.0,0.0,0.0,[]


In [13]:
def get_sentiment(text):
    # tokenize input text
    sentence = flair.data.Sentence(text)
    # make sentiment prediction
    model.predict(sentence)
    # extract sentiment direction and confidence (label and score) object
    sentiment = sentence.labels[0]
    return sentiment

In [14]:
# get sentiment
df['sentiment'] = df['selftext'].apply(get_sentiment)
df.head()

Unnamed: 0,name,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations,sentiment
0,t3_qwth1y,1637253000.0,investing,"Rather than comparing P/E to History, Compare ...",S&amp;P 500 forward price to earnings ratios a...,1.0,3.0,0.0,3.0,"['Tesla', 'Fidelity Large Cap Value', 'Roblox'...",NEGATIVE (0.9888)
1,t3_qwt3jd,1637252000.0,investing,Sono Group: An EV startup that managed to cont...,"Sono Group ($SEV), an EV startup that went pub...",0.83,4.0,0.0,4.0,"['Sono Group', 'SAAB', 'NEVS', 'Evergrande Hea...",NEGATIVE (1.0)
2,t3_qwrgj1,1637247000.0,investing,Covered calls tips and questions?,Just a few questions that I'm still confused a...,1.0,1.0,0.0,1.0,[],NEGATIVE (1.0)
3,t3_qwr6np,1637246000.0,investing,Turkey defies warnings and cuts interest rates,The central bank cut its one-week repo rate by...,0.93,32.0,0.0,32.0,[],NEGATIVE (1.0)
4,t3_qwqm7x,1637245000.0,investing,To what extent can cryptoassets be an ally of ...,Could the consumer price index be higher today...,0.36,0.0,0.0,0.0,[],NEGATIVE (0.9921)


In [15]:
import ast

df['organizations'] = df['organizations'].apply(lambda x: ast.literal_eval(x))

In [16]:
# initialize sentiment dictionary
sentiment = {}

# loop through dataframe and extract org labels and sentiment scores into sentiment dictionary
for i, row in df.iterrows():
    # extract sentiment direction and score
    direction = row['sentiment'].value
    score = row['sentiment'].score
    # loop through each label in organizations column
    for org in row['organizations']:
        # check if org label exists in sentiment dictionary already
        if org not in sentiment.keys():
            # if it doesn't, initialize new entry in dictionary
            sentiment[org] = {'POSITIVE': [], 'NEGATIVE': []}
        # append positive/negative score to respective dictionary entry
        sentiment[org][direction].append(score)

In [17]:
sentiment['ARK']

{'POSITIVE': [], 'NEGATIVE': [0.9841217994689941]}

In [18]:
# initialize sentiment list
avg_sentiment = []

# loop through each organization
for org in sentiment.keys():
    # get number of positive and negative ratings
    freq = len(sentiment[org]['POSITIVE']) + len(sentiment[org]['NEGATIVE'])
    pos_freq = len(sentiment[org]['POSITIVE'])
    neg_freq = len(sentiment[org]['NEGATIVE'])
    for direction in ['POSITIVE', 'NEGATIVE']:
        # assign to variable for cleaner code
        score = sentiment[org][direction]
        # if there are no entries, set to 0
        if len(score) == 0:
            sentiment[org][direction] = 0.0
        else:
            # otherwise calculate total
            sentiment[org][direction] = sum(score)
    # now calculate total amount
    total = sentiment[org]['POSITIVE'] - sentiment[org]['NEGATIVE']
    # and the average score
    avg = total/freq
    pos_avg = sentiment[org]['POSITIVE'] / pos_freq if pos_freq != 0 else 0
    neg_avg = sentiment[org]['NEGATIVE'] / neg_freq if neg_freq != 0 else 0
    # add to sentiment list
    avg_sentiment.append({
        'entity': org,
        'positive': pos_avg,
        'negative': neg_avg,
        'frequency': freq,
        'score': avg
    })

In [19]:
sentiment_df = pd.DataFrame(avg_sentiment)
sentiment_df.head()

Unnamed: 0,entity,positive,negative,frequency,score
0,Tesla,0.907746,0.910459,18,-0.203379
1,Fidelity Large Cap Value,0.0,0.988792,1,-0.988792
2,Roblox,0.985825,0.988792,2,-0.001484
3,Nvidia,0.0,0.887186,4,-0.887186
4,Fidelity,0.715206,0.957049,14,-0.837603


In [20]:
sentiment_df = sentiment_df[sentiment_df['frequency'] > 3]
sentiment_df

Unnamed: 0,entity,positive,negative,frequency,score
0,Tesla,0.907746,0.910459,18,-0.203379
3,Nvidia,0.0,0.887186,4,-0.887186
4,Fidelity,0.715206,0.957049,14,-0.837603
6,Intel,0.906587,0.866262,8,-0.644656
7,Rivian,0.829072,0.897259,7,-0.404022
9,AMD,0.906587,0.90769,6,-0.605311
26,Evergrande,0.675813,0.982998,17,-0.885421
56,JNJ,0.786377,0.988701,4,-0.101162
60,Ford,0.908902,0.887885,9,0.11033
61,Lucid,0.708585,0.999062,5,0.025526


In [21]:
sentiment_df.sort_values('score', ascending=False).head(10)

Unnamed: 0,entity,positive,negative,frequency,score
72,BHP,0.94769,0.73768,4,0.526347
225,amp,0.873486,0.93311,4,0.421837
60,Ford,0.908902,0.887885,9,0.11033
331,&,0.824393,0.990894,5,0.098278
61,Lucid,0.708585,0.999062,5,0.025526
260,NASDAQ,0.878873,0.963999,8,-0.042563
56,JNJ,0.786377,0.988701,4,-0.101162
396,Morgan Stanley,0.77033,0.979895,4,-0.104783
232,Microsoft,0.951678,0.869696,5,-0.141147
550,Alibaba,0.998165,0.933271,5,-0.160697


In [24]:
sentiment_df.sort_values('score', ascending=True).head(10)

Unnamed: 0,entity,positive,negative,frequency,score
246,Bloomberg,0.0,0.998685,7,-0.998685
124,Uber,0.0,0.998583,4,-0.998583
110,China Evergrande Group,0.0,0.998412,5,-0.998412
243,SPY,0.0,0.991321,4,-0.991321
137,reddit,0.0,0.984551,8,-0.984551
410,FB,0.0,0.982966,4,-0.982966
120,BABA,0.0,0.97878,5,-0.97878
294,Investopedia,0.0,0.973442,4,-0.973442
113,the Federal Reserve,0.0,0.942119,4,-0.942119
467,Schwab,0.0,0.934601,4,-0.934601


In [23]:
sentiment_df.to_csv('./data/reddit_investing_ner_sentiment.csv', sep='|', index=False)