# NER and Sentiment

In [2]:
import pandas as pd
import flair

### Loading the pre-trained DistilBERT sentiment model from the flair library

In [3]:
# Load flair's pre-trained English sentiment classifier ('en-sentiment').
# As the log output below shows, the first call downloads the weights
# (265,512,723 bytes) and copies them into the local flair model cache.
model = flair.models.TextClassifier.load('en-sentiment')

2022-12-27 23:17:59,842 https://nlp.informatik.hu-berlin.de/resources/models/sentiment-curated-distilbert/sentiment-en-mix-distillbert_4.pt not found in cache, downloading to /tmp/tmpy66hqc68


100%|█████████████████████████| 265512723/265512723 [01:36<00:00, 2753477.64B/s]

2022-12-27 23:19:36,850 copying /tmp/tmpy66hqc68 to cache at /home/darth/.flair/models/sentiment-en-mix-distillbert_4.pt





2022-12-27 23:19:37,054 removing temp file /tmp/tmpy66hqc68
2022-12-27 23:19:37,094 loading file /home/darth/.flair/models/sentiment-en-mix-distillbert_4.pt


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Defining get_sentiment function

In [4]:
def get_sentiment(text):
    """Classify the sentiment of `text` with the globally loaded flair `model`.

    The text is wrapped in a flair `Sentence`, the classifier annotates that
    sentence, and the first label attached to it is returned. Callers read the
    direction and confidence from the label's `.value` and `.score`.
    """
    tokenized = flair.data.Sentence(text)
    # predict() attaches its labels to the Sentence; its return value is unused
    model.predict(tokenized)
    return tokenized.labels[0]

### Reading data and applying get_sentiment function

In [6]:
# read the pipe-delimited Reddit posts (already annotated with an `organizations` column)
df = pd.read_csv('./data/reddit_investing_ner.csv', delimiter='|')
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations
0,t3_zwixbx,1672156884,investing,Companies that are focusing on desalination?,"Hi All,\n\nIm wondering if anyone out there kn...",0.5,0,0,0,[]
1,t3_zwivno,1672156783,investing,Legendary investor David Tepper revealed his s...,The only stock he bought was Google (GOOG). He...,0.83,4,0,4,[]
2,t3_zwi6qy,1672154900,investing,Investing for roughly two year window.,"Hello, first allow me to say that my wife and ...",0.2,0,0,0,"['DCA', 'Ammo.com', 'REIT', 'SPLG', 'LETB', 'F..."
3,t3_zwhson,1672153936,investing,ELI5: How can wash sales be used to actually a...,(Edit: My assumption is that they were abused ...,0.43,0,0,0,[]
4,t3_zwhm6b,1672153434,investing,Chipmakers Struggle With Inventory Buildup On ...,https://finance.yahoo.com/news/chipmakers-stru...,0.89,32,0,32,['Micron']


In [7]:
# score every post body; each entry of the new column is the label returned by get_sentiment
df['sentiment'] = df['selftext'].map(get_sentiment)
df.head()

Unnamed: 0,id,created_utc,subreddit,title,selftext,upvote_ratio,ups,downs,score,organizations,sentiment
0,t3_zwixbx,1672156884,investing,Companies that are focusing on desalination?,"Hi All,\n\nIm wondering if anyone out there kn...",0.5,0,0,0,[],"Sentence: ""Hi All , Im wondering if anyone out..."
1,t3_zwivno,1672156783,investing,Legendary investor David Tepper revealed his s...,The only stock he bought was Google (GOOG). He...,0.83,4,0,4,[],"Sentence: ""The only stock he bought was Google..."
2,t3_zwi6qy,1672154900,investing,Investing for roughly two year window.,"Hello, first allow me to say that my wife and ...",0.2,0,0,0,"['DCA', 'Ammo.com', 'REIT', 'SPLG', 'LETB', 'F...","Sentence: ""Hello , first allow me to say that ..."
3,t3_zwhson,1672153936,investing,ELI5: How can wash sales be used to actually a...,(Edit: My assumption is that they were abused ...,0.43,0,0,0,[],"Sentence: ""( Edit : My assumption is that they..."
4,t3_zwhm6b,1672153434,investing,Chipmakers Struggle With Inventory Buildup On ...,https://finance.yahoo.com/news/chipmakers-stru...,0.89,32,0,32,['Micron'],"Sentence: ""https :// finance.yahoo.com / news ..."


### Getting overall sentiment for each organisation

In [8]:
import ast

# The CSV stores each organisation list as its string repr (e.g. "['Micron']"),
# so parse it back into a real Python list. Values that are already lists are
# passed through untouched: without that guard, re-running this cell would feed
# lists to ast.literal_eval and fail with "ValueError: malformed node or string".
df['organizations'] = df['organizations'].apply(
    lambda x: x if isinstance(x, list) else ast.literal_eval(x)
)

In [9]:
# per-organisation score buckets: {org: {'POSITIVE': [...], 'NEGATIVE': [...]}}
sentiment = {}

# walk the posts, filing each post's confidence score under every organisation it mentions
for _, post in df.iterrows():
    label = post['sentiment']
    # direction selects the bucket, confidence is the value stored in it
    polarity, confidence = label.value, label.score
    for org in post['organizations']:
        # create both (empty) buckets the first time an organisation is seen
        buckets = sentiment.setdefault(org, {'POSITIVE': [], 'NEGATIVE': []})
        buckets[polarity].append(confidence)

In [10]:
# spot-check the scores collected for a single organisation label
sentiment['ARK']

{'POSITIVE': [],
 'NEGATIVE': [0.9993973970413208, 0.9999769926071167, 0.9995846152305603]}

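Now we loop through each organization entry in the sentiment dictionary and calculate its total positive score, its total negative score, and a net average score (positive total minus negative total, divided by the number of scores recorded for that organization):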

In [11]:
# initialize sentiment list
avg_sentiment = []

# loop through each organization and summarise its collected scores
for org, scores in sentiment.items():
    positive_scores = scores['POSITIVE']
    negative_scores = scores['NEGATIVE']
    # number of positive and negative ratings recorded for this organization
    freq = len(positive_scores) + len(negative_scores)
    # Totals are kept in local variables instead of being written back into
    # `sentiment`: replacing the lists with floats makes this cell fail on a
    # re-run (len() of a float) and changes what `sentiment[org]` holds for
    # every other cell. An empty bucket contributes 0.0.
    positive_total = sum(positive_scores) if positive_scores else 0.0
    negative_total = sum(negative_scores) if negative_scores else 0.0
    # net score: positive total minus negative total, averaged over all ratings
    avg = (positive_total - negative_total) / freq
    # add to sentiment list
    avg_sentiment.append({
        'entity': org,
        'positive': positive_total,
        'negative': negative_total,
        'frequency': freq,
        'score': avg
    })

In [12]:
# one row per organisation; columns come from the keys of each summary dict
sentiment_df = pd.DataFrame(data=avg_sentiment)
sentiment_df.head(5)

Unnamed: 0,entity,positive,negative,frequency,score
0,DCA,3.313239,11.312348,16,-0.499944
1,Ammo.com,0.0,0.99204,1,-0.99204
2,REIT,0.0,2.987812,3,-0.995937
3,SPLG,0.795051,0.99204,2,-0.098494
4,LETB,0.0,0.99204,1,-0.99204


In [13]:
# keep only organisations with more than three recorded scores
sentiment_df = sentiment_df.loc[sentiment_df['frequency'].gt(3)]
sentiment_df

Unnamed: 0,entity,positive,negative,frequency,score
0,DCA,3.313239,11.312348,16,-0.499944
6,AI,1.996807,2.850134,5,-0.170665
11,FAQ,0.0,49.932495,50,-0.99865
14,JEPI,0.825488,4.85858,6,-0.672182
15,Tesla,1.95804,7.951936,10,-0.59939
16,YoY,0.0,6.991672,7,-0.99881
17,Fidelity,3.84496,27.870648,34,-0.706638
18,treasury,0.0,16.776972,17,-0.986881
21,VOO,2.566655,20.965476,24,-0.766618
24,Twitter,0.0,4.99823,5,-0.999646


In [14]:
# ten organisations with the highest net score, best first
sentiment_df.sort_values(by='score', ascending=False).iloc[:10]

Unnamed: 0,entity,positive,negative,frequency,score
106,Amazon,7.566042,5.882307,14,0.120267
215,IBKR,1.752685,1.999942,4,-0.061814
6,AI,1.996807,2.850134,5,-0.170665
401,Google,1.991441,2.888256,5,-0.179363
405,Apple,3.66836,5.701936,10,-0.203358
193,FCF,1.733203,2.915169,5,-0.236393
223,TD Ameritrade,1.653039,2.989827,5,-0.267358
625,BYD,0.997469,2.420742,4,-0.355818
235,Morningstar,0.96154,2.634294,4,-0.418188
359,HSA,2.175798,6.996149,10,-0.482035
