This project is about NLP sentiment analysis with **Flair**. 
Firstly, we pulled out data from reddit via api, performed text mining and entity recognition (NER), and then chosen 'ORG' for Flair-based sentiment analysis (position or negative), finally extract each of the organizations (entity of ORG) with its sentiment score.

In [1]:
import pandas as pd
import requests
client_id = 'CI5ykwrkfCIoE2qa8-HUYAZ'
secret_token = 'URPD5tlRQUBeGvDW-jXuUMOg9p5jkAM'
auth = requests.auth.HTTPBasicAuth(client_id, secret_token)
username = 'XXXX'
password = 'XXXX'
login = {'grant_type': 'password',
            'username': username,
            'password': password}
headers = {'User-Agent': 'MyBot/0.0.1'}
res = requests.post(f'https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=login, headers=headers)
token = res.json()['access_token']
headers['Authorization'] = f'bearer {token}'
api = 'https://oauth.reddit.com'
res = requests.get(f'{api}/r/investing/new', headers=headers, params = {'limit':'100'})
# res.json()

In [2]:
df = pd.DataFrame()
for content in res.json()['data']['children']:
    df = df.append({
        'id': content['data']['name'],
        'created_utc': int(content['data']['created_utc']),
        'subreddit': content['data']['subreddit'],
        'title': content['data']['title'],
        'selftext': content['data']['selftext'],
        'upvote_ratio': content['data']['upvote_ratio'],
        'ups': content['data']['ups'],
        'downs': content['data']['downs'],
        'score': content['data']['score']
    }, ignore_index=True)
data = df.replace({'|': ''}, regex=True)
data.to_csv(f'./data/reddit_investing.csv', sep='|', index=False)

In [7]:
df1 = pd.read_csv('./data/reddit_investing.csv', sep='|')
df1.iloc[5:10,:]

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio
5,1614276000.0,0.0,t3_lscgjn,1.0,"I'm still bullish on eBay, and I'm tempted to ...",investing,Bearish Arguments For eBay,1.0,0.6
6,1614271000.0,0.0,t3_lsat2y,118.0,Coinbase Global made public on Thursday the pr...,investing,Coinbase Could Go Public as Soon as This March,118.0,0.96
7,1614269000.0,0.0,t3_ls9vhs,7.0,I've been teaching myself a lot about investin...,investing,Do you model before investing?,7.0,0.82
8,1614268000.0,0.0,t3_ls9lct,23.0,Historically gold has always functioned as a h...,investing,What happened to gold as equity market hedge,23.0,0.82
9,1614267000.0,0.0,t3_ls9a7e,37.0,https://www.reuters.com/article/us-usa-economy...,investing,US Weekly Jobless Claims Fall More Than Expect...,37.0,0.91


In [8]:
## NER analysis for text (SpaCy)
import spacy
import pandas as pd
nlp = spacy.load('en_core_web_sm')
def collect_orgs(text):
    doc = nlp(text)
    orgs_res = []
    for entity in doc.ents:                # remove non-organization labels
        if entity.label_ == 'ORG' and entity.text.lower() not in ['ev', 'covid', 'etf', 'nyse', 'sec', 'spac', 'fda']:   
            orgs_res.append(entity.text)
    res = list(set(orgs_res))
    return res
df1['organizations'] = df1['selftext'].apply(collect_orgs)
df1.iloc[5:10,:]

Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations
5,1614276000.0,0.0,t3_lscgjn,1.0,"I'm still bullish on eBay, and I'm tempted to ...",investing,Bearish Arguments For eBay,1.0,0.6,"[eBay, CMA, Adevinta]"
6,1614271000.0,0.0,t3_lsat2y,118.0,Coinbase Global made public on Thursday the pr...,investing,Coinbase Could Go Public as Soon as This March,118.0,0.96,"[J.P. Morgan Securities, Allen &, Citigroup Gl..."
7,1614269000.0,0.0,t3_ls9vhs,7.0,I've been teaching myself a lot about investin...,investing,Do you model before investing?,7.0,0.82,[]
8,1614268000.0,0.0,t3_ls9lct,23.0,Historically gold has always functioned as a h...,investing,What happened to gold as equity market hedge,23.0,0.82,[]
9,1614267000.0,0.0,t3_ls9a7e,37.0,https://www.reuters.com/article/us-usa-economy...,investing,US Weekly Jobless Claims Fall More Than Expect...,37.0,0.91,"[the Labor Department, the U.S., the Commerce ..."


In [9]:
## Freqs of entity (ORG)
orgs_Lists = df1['organizations'].to_list()
orgs = [org for new in orgs_Lists for org in new]
from collections import Counter
org_nums = Counter(orgs)
org_nums.most_common(5)

[('GME', 32), ('Amazon', 24), ('EPS', 22), ('Apple', 22), ('ARK', 19)]

In [10]:
# Flair Sentiment 
import pandas as pd
import flair
model = flair.models.TextClassifier.load('en-sentiment')
def sentiment_func(text):
    sentence = flair.data.Sentence(text)        # tokenization  
    model.predict(sentence)
    return sentence.labels[0]
df1['sentiment'] = df1['selftext'].apply(sentiment_func)
df1.iloc[5:10,:]

2022-02-02 10:42:31,619 loading file C:\Users\Sealion\.flair\models\sentiment-en-mix-distillbert_4.pt


Unnamed: 0,created_utc,downs,id,score,selftext,subreddit,title,ups,upvote_ratio,organizations,sentiment
5,1614276000.0,0.0,t3_lscgjn,1.0,"I'm still bullish on eBay, and I'm tempted to ...",investing,Bearish Arguments For eBay,1.0,0.6,"[eBay, CMA, Adevinta]",NEGATIVE (0.9998)
6,1614271000.0,0.0,t3_lsat2y,118.0,Coinbase Global made public on Thursday the pr...,investing,Coinbase Could Go Public as Soon as This March,118.0,0.96,"[J.P. Morgan Securities, Allen &, Citigroup Gl...",NEGATIVE (0.9976)
7,1614269000.0,0.0,t3_ls9vhs,7.0,I've been teaching myself a lot about investin...,investing,Do you model before investing?,7.0,0.82,[],NEGATIVE (0.9464)
8,1614268000.0,0.0,t3_ls9lct,23.0,Historically gold has always functioned as a h...,investing,What happened to gold as equity market hedge,23.0,0.82,[],NEGATIVE (0.9999)
9,1614267000.0,0.0,t3_ls9a7e,37.0,https://www.reuters.com/article/us-usa-economy...,investing,US Weekly Jobless Claims Fall More Than Expect...,37.0,0.91,"[the Labor Department, the U.S., the Commerce ...",NEGATIVE (0.9494)


In [15]:
## Extract each of the organizations alongside it's sentiment score.
import ast
from ast import literal_eval
df1['organizations'] = df1['organizations'].apply(lambda x: ast.literal_eval(str(x)))   # convert to a list
total_sentiment = {}
for i, row in df1.iterrows():
    value_org = row['sentiment'].value
    score = row['sentiment'].score
    for org in row['organizations']:
        if org not in total_sentiment.keys():
            total_sentiment[org] = {'POSITIVE': [], 'NEGATIVE': []}
        total_sentiment[org][value_org].append(score)
total_sentiment

{'St. Louis Fed': {'POSITIVE': [], 'NEGATIVE': [0.9916453957557678]},
 'Federal Reserve': {'POSITIVE': [0.981378436088562],
  'NEGATIVE': [0.9916453957557678, 0.9440180659294128]},
 'Fed Chair': {'POSITIVE': [], 'NEGATIVE': [0.9916453957557678]},
 'the Atlanta Fed’s': {'POSITIVE': [], 'NEGATIVE': [0.9916453957557678]},
 'Powell’s': {'POSITIVE': [], 'NEGATIVE': [0.9916453957557678]},
 'Fed': {'POSITIVE': [0.6107646226882935,
   0.9688535332679749,
   0.7470293641090393],
  'NEGATIVE': [0.9916453957557678,
   0.9998539686203003,
   0.9999926090240479,
   0.9991708993911743,
   0.9998053908348083,
   0.9999523162841797,
   0.9440180659294128,
   0.9999403953552246,
   0.9994103908538818,
   0.9993396401405334,
   0.9999759197235107]},
 'Treasury': {'POSITIVE': [0.6107646226882935, 0.9857498407363892],
  'NEGATIVE': [0.9916453957557678,
   0.9998053908348083,
   0.9999403953552246,
   0.9993396401405334]},
 'Bostic': {'POSITIVE': [], 'NEGATIVE': [0.9916453957557678]},
 'ARK': {'POSITIVE': 

In [16]:
# For average scores
avg_sentiment = []
for org in total_sentiment.keys():
    freq = len(total_sentiment[org]['POSITIVE']) + len(total_sentiment[org]['NEGATIVE'])
    for ind in ['POSITIVE', 'NEGATIVE']:
        score = total_sentiment[org][ind]
        if len(score) == 0:
            total_sentiment[org][ind] = 0.0
        else:
            total_sentiment[org][ind] = sum(score)
    total = total_sentiment[org]['POSITIVE'] - total_sentiment[org]['NEGATIVE']
    avg = total/freq
    avg_sentiment.append({
        'entity': org,
        'positive': total_sentiment[org]['POSITIVE'],
        'negative': total_sentiment[org]['NEGATIVE'],
        'frequency': freq,
        'score': avg
    })
sentiment_df = pd.DataFrame(avg_sentiment)
sentiment_df.sort_values('score', ascending=False).head(10)
sentiment_df.head()

Unnamed: 0,entity,positive,negative,frequency,score
0,St. Louis Fed,0.0,0.991645,1,-0.991645
1,Federal Reserve,0.981378,1.935663,3,-0.318095
2,Fed Chair,0.0,0.991645,1,-0.991645
3,the Atlanta Fed’s,0.0,0.991645,1,-0.991645
4,Powell’s,0.0,0.991645,1,-0.991645


Refs:

(1). NLP (Udemy); 

(2). Kaggle;

(3). Reddit.